This document contains all statistical analyses conducted for the manuscript.
All data to reproduce analysis can be found here: https://github.com/biocom-uib/CAUvsCYM
rm(list = ls())
knitr::opts_chunk$set(
fig.width=10,
out.width="50%",
fig.asp = 1,
fig.align="center",
echo = TRUE,
message = FALSE,
warning = FALSE,
hiline = TRUE,
cache=TRUE
)
options(knitr.kable.NA = '')
knitr::opts_knit$set(global.par=TRUE)
par(cex.main=0.9,cex.axis=0.8,cex.lab=0.8)
options(scipen = 999)
We load the necessary packages:
library(readxl)
library(readr)
library(dplyr)
library(tidyr)
library(knitr)
library(ggplot2)
library(grid)
library(tidyverse)
library(splines)
library(zCompositions)
library(compositions)
library(robCompositions)
library(easyCODA)
library(ALDEx2)
library(viridis)
library(plotly)
library(ComplexHeatmap)
library(stats)
library(dendextend)
library(RColorBrewer)
library(kableExtra)
library(vegan)
library(ecolTest)
library(factoextra)
library(coda4microbiome)
library(broom)
source("funcionsCODAMETACIRCLE.R")
opar=par()
library(flextable)
set_flextable_defaults(
font.family = "Arial", font.size = 10,
border.color = "gray", big.mark = "")
#Data and factors
datos <- read_table("Sulfur_data_28ago25.txt")
datos$Species<-as.factor(datos$Species)
datos$Treatment<-as.factor(datos$Treatment)
datos$Time<-factor(datos$Time, levels = c("T0", "T2"))
#Group summary for plotting
df_summary <- datos %>%
group_by(Time, Species, Treatment) %>%
summarise(
mean_sulfur = mean(Sulfur, na.rm = TRUE),
sd_sulfur = sd(Sulfur, na.rm = TRUE),
n = n(),
se_sulfur = sd_sulfur / sqrt(n)
) %>%
ungroup()
# --- Figure (as in the paper: means ± SE) ---
p_a<- ggplot() +
geom_jitter(data = datos,
aes(x = Time, y = Sulfur, color = Species,
shape = Treatment),
width = 0.1,
alpha = 0.4,
size = 1.2) +
geom_line(data = df_summary,
aes(x = Time,
y = mean_sulfur,
color = Species,
group = interaction(Species, Treatment),
linetype = Treatment),
size = 1) +
geom_point(data = df_summary,
aes(x = Time,
y = mean_sulfur,
color = Species,
shape = Treatment),
size = 3) +
geom_errorbar(data = df_summary,
aes(x = Time,
ymin = mean_sulfur - se_sulfur,
ymax = mean_sulfur + se_sulfur,
color = Species),
width = 0.1,
size = 1.2) +
scale_color_manual(values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
labs(
x = "Time",
y = expression(paste("Sulfur (", mu, "M)")),
color = "Species",
shape = "Treatment",
linetype = "Treatment"
) +
theme_minimal() +
theme(axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
legend.title = element_text(face = "bold")
) +
coord_cartesian(ylim = c(400, 1400)) +
scale_y_continuous(
breaks = seq(400, 1400, by = 100)
)
p_a
The figure shows sulfur concentrations (mean ± SE) at baseline (T0) and after exposure (T2) for sediments colonized by Caulerpa (blue) and Cymodocea (orange) under control (solid lines) and heatwave (dashed lines) conditions. In both species, sulfur levels increased from T0 to T2. The increase was evident under both control and heatwave treatments in Caulerpa, whereas in Cymodocea the rise was more pronounced under heatwave conditions. Individual data points (faded symbols) illustrate the variability within groups.
To evaluate changes in sediment sulfur concentration, we applied a nonparametric testing framework. First, we examined within-group temporal changes (T0 vs T2) separately for each Species × Treatment using Wilcoxon tests, with p-values adjusted by the Benjamini–Hochberg procedure. We then compared treatments at T2 (Control vs Heatwave) within each species to assess whether heatwave exposure altered sulfur at the final time point. This two-step approach provides both temporal (within-group) and treatment-level (between-group) perspectives on sulfur dynamics.
We first report group medians at T0 and T2 and their difference (Δ = T2 − T0) for each Species × Treatment. These summaries are descriptive and motivate the subsequent nonparametric tests.
medians <- datos %>%
group_by(Species, Treatment, Time) %>%
summarise(median_sulfur = median(Sulfur, na.rm = TRUE), n = dplyr::n(), .groups = "drop") %>%
tidyr::pivot_wider(names_from = Time, values_from = median_sulfur) %>%
# Opción A: con comillas invertidas (recomendado)
mutate(delta_median = `T2` - `T0`)
# Opción B (alternativa robusta):
# mutate(delta_median = get("T2") - get("T0"))
knitr::kable(medians, digits = 2, caption = "Medians by group and change (T2 - T0)")
| Species | Treatment | n | T0 | T2 | delta_median |
|---|---|---|---|---|---|
| CAU | Control | 9 | 945.91 | 1151.37 | 205.46 |
| CAU | HW | 9 | 939.62 | 1284.93 | 345.31 |
| CYM | Control | 9 | 785.53 | 894.56 | 109.03 |
| CYM | HW | 9 | 835.85 | 1109.06 | 273.21 |
We then formally tested T0 vs T2 within each Species × Treatment using Wilcoxon rank–sum tests, controlling the false discovery rate with Benjamini–Hochberg adjustment.
time_tests <- datos %>%
dplyr::group_by(Species, Treatment) %>%
dplyr::summarise(
p = {
d <- dplyr::cur_data()
x <- d$Sulfur[d$Time == "T0"]
y <- d$Sulfur[d$Time == "T2"]
if (length(x) > 0 && length(y) > 0) {
suppressWarnings(stats::wilcox.test(x, y, exact = FALSE)$p.value)
} else {
NA_real_
}
},
.groups = "drop"
) %>%
dplyr::mutate(p_adj = p.adjust(p, method = "BH"))
knitr::kable(time_tests, digits = 3,
caption = "Wilcoxon (T0 vs T2) per Species × Treatment (BH-adjusted p)")
| Species | Treatment | p | p_adj |
|---|---|---|---|
| CAU | Control | 0.008 | 0.011 |
| CAU | HW | 0.001 | 0.002 |
| CYM | Control | 0.158 | 0.158 |
| CYM | HW | 0.004 | 0.007 |
These outputs provide a basis for the statement that sulfur increased in both species and that the rise was more pronounced under heatwave conditions—particularly for Cymodocea at T2.
To complement the nonparametric analyses and jointly assess the effects of time and treatment, a difference-in-differences (DiD) model was applied separately for each species. This approach makes it possible to test whether the temporal change in sulfur differs significantly between control and heatwave conditions, while accounting for the main effects of treatment and time. In particular, the interaction term (Treatment × Time) captures the differential effect attributable to the heatwave at the final time point (TF), thus providing a direct measure of the magnitude of the treatment impact in each species.
Cymodocea — sulfur analysis
cy <- datos %>%
dplyr::filter(Species == "CYM")
cy$Treatment <- factor(cy$Treatment, levels = c("Control", "HW"))
cy$Time <- factor(cy$Time, levels = c("T0", "T2"))
fit_cy <- lm(Sulfur ~ Treatment + Time + Treatment:Time, data = cy)
summary(fit_cy)
##
## Call:
## lm(formula = Sulfur ~ Treatment + Time + Treatment:Time, data = cy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -318.31 -118.50 32.97 95.49 266.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 720.20 56.15 12.826 0.0000000000000372 ***
## TreatmentHW 21.73 79.41 0.274 0.7861
## TimeT2 163.62 79.41 2.061 0.0476 *
## TreatmentHW:TimeT2 129.66 112.30 1.155 0.2568
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 168.5 on 32 degrees of freedom
## Multiple R-squared: 0.3877, Adjusted R-squared: 0.3303
## F-statistic: 6.754 on 3 and 32 DF, p-value: 0.001172
In Cymodocea, sulfur concentrations increased significantly from T0 to T2 under control conditions (Time effect: p = 0.048). The DiD model further estimated an additional increase under heatwave exposure at T2 (interaction coefficient = 129.7), consistent with the descriptive pattern of a more pronounced rise under heatwave conditions. However, this differential effect was not statistically significant (p = 0.26), indicating that while the data suggest a stronger increase in Cymodocea under heatwave exposure, statistical support for this effect remains moderate.
Caulerpa — sulfur analysis
ca <- datos %>%
dplyr::filter(Species == "CAU")
ca$Treatment <- factor(cy$Treatment, levels = c("Control", "HW"))
ca$Time <- factor(cy$Time, levels = c("T0", "T2"))
fit_ca <- lm(Sulfur ~ Treatment + Time + Treatment:Time, data = ca)
summary(fit_ca)
##
## Call:
## lm(formula = Sulfur ~ Treatment + Time + Treatment:Time, data = ca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -242.770 -71.945 -7.951 83.563 226.415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 955.346 40.921 23.346 < 0.0000000000000002 ***
## TreatmentHW -1.572 57.872 -0.027 0.978493
## TimeT2 222.755 57.872 3.849 0.000534 ***
## TreatmentHW:TimeT2 114.871 81.843 1.404 0.170080
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 122.8 on 32 degrees of freedom
## Multiple R-squared: 0.6131, Adjusted R-squared: 0.5769
## F-statistic: 16.91 on 3 and 32 DF, p-value: 0.0000009271
In Caulerpa, sulfur concentrations increased significantly from T0 to T2 under control conditions (Time effect: p < 0.001). The DiD model further estimated an additional increase under heatwave exposure at T2 (interaction coefficient = 114.9), consistent with the descriptive pattern of greater accumulation under heatwave conditions. However, this differential effect was not statistically significant (p = 0.17), indicating that while sulfur accumulation clearly occurred over time, the evidence for an extra increase driven by heatwave exposure remains moderate.
#Data
datos <- read_table("redox_data_12feb25.txt")
datos$Species<-as.factor(datos$Species)
datos$Treatment<-as.factor(datos$Treatment)
datos$Time<-factor(datos$Time, levels = c("T0", "T2"))
#Group summary for plotting
df_summary <- datos %>%
group_by(Time, Species, Treatment) %>%
summarise(
mean_redox = mean(redox, na.rm = TRUE),
sd_redox = sd(redox, na.rm = TRUE),
n = n(),
se_redox = sd_redox / sqrt(n)
) %>%
ungroup()
p_b <- ggplot() +
geom_jitter(data = datos,
aes(x = Time, y = redox, color = Species, shape = Treatment),
width = 0.1,
alpha = 0.4,
size = 1.2) +
geom_line(data = df_summary,
aes(x = Time,
y = mean_redox,
color = Species,
group = interaction(Species, Treatment),
linetype = Treatment),
size = 1) +
geom_point(data = df_summary,
aes(x = Time,
y = mean_redox,
color = Species,
shape = Treatment),
size = 3) +
geom_errorbar(data = df_summary,
aes(x = Time,
ymin = mean_redox - se_redox,
ymax = mean_redox + se_redox,
color = Species),
width = 0.1,
size = 1.2) +
scale_color_manual(values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
labs(
x = "Time",
y = expression(paste("redox (",m, "M)")) ,
color = "Species",
shape = "Treatment",
linetype = "Treatment"
) +
theme_minimal() +
theme(axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
legend.title = element_text(face = "bold")
) +
coord_cartesian(ylim = c(-430, -330)) +
scale_y_continuous(
breaks = seq(-430, -330, by = 10)
)
p_b
The figure shows sediment redox potential (mean ± SE) at baseline (T0) and after exposure (T2) for sediments colonized by Caulerpa (blue) and Cymodocea (orange) under control (solid lines) and heatwave (dashed lines) conditions. In Caulerpa, redox potential declined significantly from T0 to T2 under control conditions but remained stable under heatwave conditions. In Cymodocea, redox potential also decreased over time, with a stronger decline under heatwave treatment.
To evaluate changes in sediment redox potential, we applied a nonparametric testing framework. First, we examined within-group temporal changes (T0 vs T2) separately for each Species × Treatment using Wilcoxon tests, with p-values adjusted by the Benjamini–Hochberg procedure. We then compared treatments at T2 (Control vs Heatwave) within each species to assess whether heatwave exposure altered redox potential at the final time point. This two-step approach provides both temporal (within-group) and treatment-level (between-group) perspectives on redox dynamics.
We present a nonparametric analysis of sediment redox potential. First, we assess within-group change from T0 to T2 for each Species × Treatment using Wilcoxon tests and compare treatments at T2 within each species. P-values are adjusted for multiple testing with the Benjamini–Hochberg procedure
eh_within <- datos %>%
dplyr::group_by(Species, Treatment) %>%
dplyr::summarise(
p = {
d <- dplyr::cur_data()
x <- d$redox[d$Time == "T0"]
y <- d$redox[d$Time == "T2"]
if (length(x) > 0 && length(y) > 0) {
suppressWarnings(stats::wilcox.test(x, y, exact = FALSE)$p.value)
} else NA_real_
},
.groups = "drop"
) %>%
dplyr::mutate(p_adj = p.adjust(p, method = "BH"))
knitr::kable(eh_within, digits = 3,
caption = "Wilcoxon (T0 vs T2) by Species × Treatment (BH-adjusted p)")
| Species | Treatment | p | p_adj |
|---|---|---|---|
| CAU | Control | 0.005 | 0.020 |
| CAU | HW | 0.575 | 0.575 |
| CYM | Control | 0.298 | 0.397 |
| CYM | HW | 0.045 | 0.091 |
These summaries show that redox potential generally decreased over time, with the largest declines occurring in Caulerpa under control conditions and in Cymodocea under heatwave conditions. These descriptive patterns motivated the subsequent nonparametric tests.
eh_t2_bt <- datos %>%
dplyr::filter(Time == "T2") %>%
dplyr::group_by(Species) %>%
dplyr::summarise(
p = {
d <- dplyr::cur_data()
if (dplyr::n_distinct(d$Treatment) >= 2) {
suppressWarnings(stats::wilcox.test(redox ~ Treatment, data = d, exact = FALSE)$p.value)
} else NA_real_
},
.groups = "drop"
) %>%
dplyr::mutate(p_adj = p.adjust(p, method = "BH"))
knitr::kable(eh_t2_bt, digits = 3,
caption = "Wilcoxon (Treatment) at T2 by Species (BH-adjusted p)")
| Species | p | p_adj |
|---|---|---|
| CAU | 0.298 | 0.298 |
| CYM | 0.045 | 0.091 |
Overall, redox potential tended to decrease from T0 to T2, but the magnitude and significance of these shifts varied across species and treatments. In Caulerpa, a significant decline was detected under control conditions (BH-adjusted p = 0.020), whereas no change was observed under heatwave conditions. In Cymodocea, redox potential also decreased over time, with a trend towards stronger reductions under heatwave exposure (raw p = 0.045), though this did not remain significant after correction (BH-adjusted p = 0.091). Comparisons between treatments at T2 revealed no differences for Caulerpa and only a marginal effect for Cymodocea (BH-adjusted p = 0.091).
We fitted difference-in-differences (DiD) models for each species. This framework evaluates whether changes in redox potential from T0 to T2 differed between control and heatwave conditions, while simultaneously accounting for the main effects of treatment and temporal progression. The key parameter of interest is the interaction term (Treatment × Time), which quantifies the additional effect attributable to heatwave exposure at the final sampling point (T2), thereby offering a direct estimate of the treatment-specific impact.
Cymodocea — redox analysis
fit_cy <- lm(Redox ~ Treatment + Time + Treatment:Time, data = cy)
summary(fit_cy)
##
## Call:
## lm(formula = Redox ~ Treatment + Time + Treatment:Time, data = cy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.133 -13.897 -2.578 10.992 58.667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -367.567 6.886 -53.382 <0.0000000000000002 ***
## TreatmentHW 3.100 9.738 0.318 0.7523
## TimeT2 -20.711 9.738 -2.127 0.0412 *
## TreatmentHW:TimeT2 -19.089 13.771 -1.386 0.1753
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.66 on 32 degrees of freedom
## Multiple R-squared: 0.4086, Adjusted R-squared: 0.3531
## F-statistic: 7.368 on 3 and 32 DF, p-value: 0.000689
In Cymodocea, redox potential declined significantly from T0 to T2 under control conditions (Time effect: –20.7, p = 0.041). The DiD model further estimated an additional reduction under heatwave exposure at T2 (interaction coefficient = –19.1), consistent with the descriptive observation of stronger declines under heatwave conditions. However, this interaction was not statistically significant (p = 0.18), indicating that while the data suggest a heatwave-driven intensification of the temporal decline in redox potential, the statistical support for this effect remains limited.
Caulerpa — redox analysis
fit_ca <- lm(Redox ~ Treatment + Time + Treatment:Time, data = ca)
summary(fit_ca)
##
## Call:
## lm(formula = Redox ~ Treatment + Time + Treatment:Time, data = ca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.3444 -11.2625 0.1944 7.6500 30.1556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -392.850 4.974 -78.977 <0.0000000000000002 ***
## TreatmentHW -13.794 7.035 -1.961 0.0586 .
## TimeT2 -6.506 7.035 -0.925 0.3620
## TreatmentHW:TimeT2 1.817 9.948 0.183 0.8563
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.92 on 32 degrees of freedom
## Multiple R-squared: 0.2002, Adjusted R-squared: 0.1252
## F-statistic: 2.67 on 3 and 32 DF, p-value: 0.06413
In Caulerpa, redox potential showed no significant temporal change from T0 to T2 under control conditions (Time effect: –6.5, p = 0.36). The model estimated a slightly lower baseline under heatwave compared to control at T0 (Treatment effect: –13.8, p = 0.059), suggesting a marginal initial difference between treatments. However, the interaction term (Treatment × Time) was small and not significant (p = 0.86), indicating that heatwave exposure did not alter the temporal trajectory of redox potential. Overall, these results suggest that, unlike Cymodocea, the decline in redox observed in Caulerpa sediments was not exacerbated by heatwave conditions.
We analyzed cell counts in sediments colonized by either Caulerpa or Cymodocea under control and heatwave conditions, assessing temporal changes (T0–T1–T2) within each Species × Treatment group as well as differences between species.
#Data
datos <- read_table("cell_shanon_data_12_05_25.txt") %>%
mutate(
Species = factor(Species),
Time = factor(Time, levels = c("T0","T1","T2")),
Treatment = factor(Treatment),
cells = as.numeric(cells)
)
#Group summary for plotting
summary_data <- datos %>%
group_by(Species, Treatment, Time) %>%
summarise(
mean_cells = mean(cells, na.rm = TRUE),
sd = sd(cells, na.rm = TRUE),
n = n(),
se = sd / sqrt(n),
.groups = "drop"
)
t1_hw <- summary_data %>%
filter(Treatment == "HW", Time == "T1") %>%
select(Species, y = mean_cells)
t2_hw <- summary_data %>%
filter(Treatment == "HW", Time == "T2") %>%
select(Species, yend = mean_cells)
segments_hw <- left_join(t1_hw, t2_hw, by = "Species")
t12 <- summary_data %>% filter(Time %in% c("T1","T2"))
rango12 <- range(t12$mean_cells - t12$se, t12$mean_cells + t12$se)
ymin12 <- rango12[1] - 0.025 * diff(rango12)
ymax12 <- rango12[2] + 0.025 * diff(rango12)
p_c<- ggplot() +
geom_jitter(
data = datos,
aes(x = Time, y = cells, color = Species, shape = Treatment),
width = 0.1, alpha = 0.4, size = 1.2
) +
geom_line(
data = summary_data,
aes(x = Time, y = mean_cells,
color = Species,
linetype = Treatment,
group = interaction(Species, Treatment)),
size = 1
) +
geom_point(
data = summary_data,
aes(x = Time, y = mean_cells, color = Species, shape = Treatment),
size = 3
) +
geom_errorbar(
data = summary_data,
aes(x = Time,
ymin = mean_cells - se,
ymax = mean_cells + se,
color = Species),
width = 0.2, size = 1.25
) +
geom_segment(
data = summary_data,
aes(y = 53800000,
yend = 38700000,
color = "CYM"),
x = 2,
xend = 3,
linetype = "dashed",
size = 1.5
) +
geom_segment(
data = summary_data,
aes(y = 173333333,
yend = 212000000,
color = "CAU"),
x = 2,
xend = 3,
linetype = "dashed",
size = 1.5
) +
scale_color_manual(name = "Species",
values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
scale_shape_manual(name = "Treatment",
values = c("Control" = 16, # circles filled
"HW" = 17)) +# triangles filled
scale_linetype_manual(name = "Treatment",
values = c("Control" = "solid",
"HW" = "dashed")) +
guides(
shape = guide_legend(override.aes = list(size = 3)),
linetype = guide_legend(override.aes = list(size = 1.5))
) +
labs(
x = "Time",
y = "Cells",
color = "Species",
shape = "Treatment",
linetype = "Treatment"
) +
theme_minimal(base_size = 12) +
theme(
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
legend.title = element_text(face = "bold")
) +
coord_cartesian(ylim = c(ymin12, ymax12)) +
scale_y_continuous(
breaks = pretty(c(ymin12, ymax12), n = 5),
labels = expression(0,5%*%10^7,1%*%10^8,1.5%*%10^8,2%*%10^8,2.5%*%10^8),
expand = expansion(mult = c(0,0))
)
p_c
The figure shows cell counts in sediments colonized by Caulerpa (blue) and Cymodocea (orange) under control (solid lines) and heatwave (dashed lines) conditions across three time points (T0, T1, T2). In Caulerpa, cell numbers remained relatively stable from T0 to T1, followed by a marked increase under heatwave conditions at T2, while control samples decreased slightly. In contrast, Cymodocea exhibited consistently lower cell counts overall, with values remaining stable between T0 and T1 but declining toward T2 under both treatments, particularly under heatwave exposure.
We first quantified the overall range of cell counts observed in each species to provide context for the magnitude of the differences.
ranges <- datos %>%
group_by(Species) %>%
summarise(
min_cells = min(cells, na.rm = TRUE),
max_cells = max(cells, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(min_x1e9 = round(min_cells / 1e9, 1),
max_x1e9 = round(max_cells / 1e9, 1))
knitr::kable(ranges, digits = 2, caption = "Species-wise ranges (×10^9 cells/g)")
| Species | min_cells | max_cells | min_x1e9 | max_x1e9 |
|---|---|---|---|---|
| CAU | 116000000 | 221000000 | 0.1 | 0.2 |
| CYM | 19800000 | 61900000 | 0.0 | 0.1 |
These ranges highlight the broader scale of variation in Caulerpa compared with Cymodocea, providing a baseline for interpreting subsequent tests.
To evaluate species-level differences, we first applied a Wilcoxon rank–sum test pooling across all time points and treatments.
# Overall (pooling all times/treatments)
bt_overall <- suppressWarnings(stats::wilcox.test(cells ~ Species, data = datos, exact = FALSE))
# By time (T0, T1, T2) — useful to show consistency
bt_by_time <- datos %>%
group_by(Time) %>%
summarise(
p = {
d <- cur_data()
suppressWarnings(stats::wilcox.test(d$cells ~ d$Species, exact = FALSE)$p.value)
},
.groups = "drop"
)
knitr::kable(bt_by_time, digits = 3, caption = "Between-species Wilcoxon by Time")
| Time | p |
|---|---|
| T0 | 0.081 |
| T1 | 0.081 |
| T2 | 0.005 |
bt_overall$p.value
## [1] 0.00003658455
This overall comparison indicated that species differ in cell counts.
To assess whether cell counts changed over time within each Species × Treatment combination, we first applied a nonparametric global test across the three sampling points (T0, T1, T2).
# Kruskal–Wallis per group
kw_time <- datos %>%
group_by(Species, Treatment) %>%
summarise(
kw_p = {
d <- cur_data()
if (n_distinct(d$Time) >= 2) {
suppressWarnings(kruskal.test(cells ~ Time, data = d)$p.value)
} else NA_real_
},
.groups = "drop"
)
# Pairwise Wilcoxon per group (BH across the 3 pairwise contrasts within each group)
pw_time <- datos %>%
group_by(Species, Treatment) %>%
group_modify(~{
d <- .x
# Need at least 2 time levels
if (n_distinct(d$Time) < 2) return(tibble())
pw <- pairwise.wilcox.test(d$cells, d$Time, p.adjust.method = "BH", exact = FALSE)
# Convert p-value matrix to long
M <- as.data.frame(as.table(pw$p.value))
names(M) <- c("Time1","Time2","p_adj")
M <- M %>% filter(!is.na(p_adj))
# Add direction using medians
meds <- d %>% group_by(Time) %>% summarise(med = median(cells, na.rm = TRUE), .groups="drop")
M <- M %>%
left_join(meds, by = c("Time1" = "Time")) %>% rename(med1 = med) %>%
left_join(meds, by = c("Time2" = "Time")) %>% rename(med2 = med) %>%
mutate(direction = case_when(
med2 > med1 ~ "increase",
med2 < med1 ~ "decrease",
TRUE ~ "no_change"
))
M
}) %>%
ungroup()
knitr::kable(kw_time, digits = 3, caption = "Kruskal–Wallis across T0–T1–T2 by Species × Treatment")
| Species | Treatment | kw_p |
|---|---|---|
| CAU | Control | 0.491 |
| CAU | HW | |
| CYM | Control | 0.193 |
| CYM | HW |
knitr::kable(pw_time, digits = 3, caption = "Pairwise Wilcoxon (BH-adjusted) and median directions")
| Species | Treatment | Time1 | Time2 | p_adj | med1 | med2 | direction |
|---|---|---|---|---|---|---|---|
| CAU | Control | T1 | T0 | 0.663 | 168000000 | 159000000 | decrease |
| CAU | Control | T2 | T0 | 0.663 | 148000000 | 159000000 | increase |
| CAU | Control | T2 | T1 | 0.663 | 148000000 | 168000000 | increase |
| CYM | Control | T1 | T0 | 1.000 | 52400000 | 54000000 | increase |
| CYM | Control | T2 | T0 | 0.286 | 26500000 | 54000000 | increase |
| CYM | Control | T2 | T1 | 0.286 | 26500000 | 52400000 | increase |
Although the nonparametric tests did not detect statistically significant within-group changes, the descriptive patterns in the figures suggest some biologically meaningful trends. In particular, Cymodocea under control conditions showed a consistent rise in median cell counts from T0 to T2, and Caulerpa exhibited fluctuations that point to a potential increase by the final time point. The absence of statistical significance likely reflects the limited sample size and variability within groups, rather than a complete lack of temporal dynamics. Thus, while the formal tests provide little evidence for strong within-group effects, the graphical trends indicate subtle trajectories that may become clearer with larger sample sizes or longer experimental follow-up.
Shannon diversity indices were computed with the diversity function in vegan and summarized by Species × Treatment × Time. We first visualized mean trajectories (±SE) together with individual observations to inspect temporal patterns and treatment contrasts.
library(readr)
library(dplyr)
library(ggplot2)
library(grid)
datos <- read_table("cell_shanon_data_12_05_25.txt") %>%
mutate(
Species = factor(Species),
Time = factor(Time, levels = c("T0","T1","T2")),
Treatment = factor(Treatment),
shanon = as.numeric(shanon)
)
summary_data <- datos %>%
group_by(Species, Treatment, Time) %>%
summarise(
mean_shanon = mean(shanon, na.rm = TRUE),
sd = sd(shanon, na.rm = TRUE),
n = n(),
se = sd / sqrt(n),
.groups = "drop"
)
t1_hw <- summary_data %>%
filter(Treatment == "HW", Time == "T1") %>%
select(Species, y = mean_shanon)
t2_hw <- summary_data %>%
filter(Treatment == "HW", Time == "T2") %>%
select(Species, yend = mean_shanon)
segments_hw <- left_join(t1_hw, t2_hw, by = "Species")
t12 <- summary_data %>% filter(Time %in% c("T1","T2"))
rango12 <- range(t12$mean_shanon - t12$se, t12$mean_shanon + t12$se)
ymin12 <- rango12[1] - 0.55 * diff(rango12)
ymax12 <- rango12[2] + 0.25 * diff(rango12)
p_d<- ggplot() +
geom_jitter(
data = datos,
aes(x = Time, y = shanon, color = Species, shape = Treatment),
width = 0.1, alpha = 0.4, size = 1.2
) +
geom_line(
data = summary_data,
aes(x = Time, y = mean_shanon,
color = Species,
linetype = Treatment,
group = interaction(Species, Treatment)),
size = 1
) +
geom_point(
data = summary_data,
aes(x = Time, y = mean_shanon, color = Species, shape = Treatment),
size = 3
) +
geom_errorbar(
data = summary_data,
aes(x = Time,
ymin = mean_shanon - se,
ymax = mean_shanon + se,
color = Species),
width = 0.2, size = 1.25
) +
geom_segment(
data = summary_data,
aes(y = 5.072667,
yend = 5.056000,
color = "CYM"),
x = 2,
xend = 3,
linetype = "dashed",
size = 1.5
) +
geom_segment(
data = summary_data,
aes(y = 5.201667,
yend = 5.180333,
color = "CAU"),
x = 2,
xend = 3,
linetype = "dashed",
size = 1.5
) +
scale_color_manual(name = "Species",
values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
scale_shape_manual(name = "Treatment",
values = c("Control" = 16, # circles filled
"HW" = 17)) +# triangles filled
scale_linetype_manual(name = "Treatment",
values = c("Control" = "solid",
"HW" = "dashed")) +
guides(
shape = guide_legend(override.aes = list(size = 3)),
linetype = guide_legend(override.aes = list(size = 1.5))
) +
labs(
x = "Time",
y = "Shannon diversity index",
color = "Species",
shape = "Treatment",
linetype = "Treatment"
) +
theme_minimal(base_size = 12) +
theme(
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
legend.title = element_text(face = "bold")
) +
coord_cartesian(ylim = c(ymin12, ymax12)) +
scale_y_continuous(
breaks = pretty(c(ymin12, ymax12), n = 5),
expand = expansion(mult = c(0,0))
)
p_d
The plot shows that Caulerpa maintains relatively high and stable diversity across the experiment, with only slight changes between T1 and T2. Cymodocea exhibits a marked rise from T0 to T1, followed by a modest leveling or decline toward T2 under heatwave exposure, while control samples remain comparatively stable. These descriptive trends motivated formal between-species comparisons.
To test whether species differed in Shannon diversity, we applied Wilcoxon rank–sum tests overall (pooling time points) and separately at each sampling time.
## Overall (all times + treatments pooled)
bt_overall <- suppressWarnings(stats::wilcox.test(shanon ~ Species, data = datos, exact = FALSE))
bt_overall_p <- bt_overall$p.value
## By time (T0, T1, T2)
bt_by_time <- datos %>%
group_by(Time) %>%
summarise(
med_CAU = median(shanon[Species == "CAU"], na.rm = TRUE),
med_CYM = median(shanon[Species == "CYM"], na.rm = TRUE),
p = {
d <- cur_data()
suppressWarnings(stats::wilcox.test(d$shanon ~ d$Species, exact = FALSE)$p.value)
},
.groups = "drop"
)
kable(bt_by_time, digits = 3, caption = "Between-species Wilcoxon by time (Shannon)")
| Time | med_CAU | med_CYM | p |
|---|---|---|---|
| T0 | 5.188 | 4.927 | 0.081 |
| T1 | 5.198 | 5.056 | 0.383 |
| T2 | 5.207 | 5.080 | 0.173 |
bt_overall_p
## [1] 0.01300303
The overall Wilcoxon test indicated a species effect across time points (report printed bt_overall_p). By-time Wilcoxon tests (unadjusted p-values as printed in the table) suggest a marginal difference at T0, with higher diversity in Caulerpa, whereas differences at T1 and T2 were not statistically significant. Taken together, the figure and tests indicate that Caulerpa generally sustains higher Shannon diversity than Cymodocea, while temporal dynamics—especially the T0→T1 increase in Cymodocea and the slight T1→T2 softening under heatwave—are modest in magnitude.
We next examined acclimation effects by directly comparing Shannon diversity between T0 and T1 for each species.
acclim <- datos %>%
group_by(Species) %>%
summarise(
med_T0 = median(shanon[Time == "T0"], na.rm = TRUE),
med_T1 = median(shanon[Time == "T1"], na.rm = TRUE),
p = {
x <- shanon[Time == "T0"]; y <- shanon[Time == "T1"]
if (length(x) > 0 && length(y) > 0)
suppressWarnings(stats::wilcox.test(x, y, exact = FALSE)$p.value)
else NA_real_
},
direction = case_when(med_T1 > med_T0 ~ "increase",
med_T1 < med_T0 ~ "decrease",
TRUE ~ "no_change"),
.groups = "drop"
)
kable(acclim, digits = 3, caption = "T0 vs T1 (acclimation) by species")
| Species | med_T0 | med_T1 | p | direction |
|---|---|---|---|---|
| CAU | 5.188 | 5.198 | 0.663 | increase |
| CYM | 4.927 | 5.056 | 0.383 | increase |
During the acclimation period (T0 → T1), Shannon diversity showed a slight increase in both Caulerpa and Cymodocea sediments (median values rising from 5.19 to 5.20 and from 4.93 to 5.06, respectively). However, these changes were not statistically significant (p = 0.66 for Caulerpa, p = 0.38 for Cymodocea), indicating that although diversity tended to rise during acclimation, the magnitude of the increase was modest and within the range of sampling variability.
To assess whether Shannon diversity varied over time in the absence of heatwave stress, we applied Kruskal–Wallis tests separately for Caulerpa and Cymodocea under control conditions. This approach evaluates overall differences across the three sampling times (T0, T1, T2) without assuming normality.
datos <- read_table("cell_shanon_data_12_05_25_complete.txt") %>%
mutate(
Species = factor(Species),
Time = factor(Time, levels = c("T0","T1","T2")),
Treatment = factor(Treatment),
cells = as.numeric(cells)
)
kw_ctrl <- datos %>%
filter(Treatment == "Control") %>%
group_by(Species) %>%
summarise(
kw_p = {
d <- cur_data()
suppressWarnings(kruskal.test(shanon ~ Time, data = d)$p.value)
},
.groups = "drop"
)
kable(kw_ctrl, digits = 3, caption = "Kruskal–Wallis test across T0–T1–T2 under control conditions (by species)")
| Species | kw_p |
|---|---|
| CAU | 0.733 |
| CYM | 0.432 |
The results (Table) show no significant temporal variation in Shannon diversity under control conditions (p = 0.733 for Caulerpa, p = 0.432 for Cymodocea). This indicates that, in the absence of heatwave exposure, microbial diversity remained relatively stable across the course of the experiment in both host species.
To evaluate whether heatwave exposure altered the temporal trajectory of Shannon diversity, we applied Kruskal–Wallis tests across the three time points within each species.
kw_hw <- datos %>%
filter(Treatment == "HW") %>%
group_by(Species) %>%
summarise(
kw_p = {
d <- cur_data()
suppressWarnings(kruskal.test(shanon ~ Time, data = d)$p.value)
},
.groups = "drop"
)
kable(kw_hw, digits = 3, caption = "Kruskal–Wallis across T0–T1–T2 under heatwave conditions (by species)")
| Species | kw_p |
|---|---|
| CAU | 0.875 |
| CYM | 0.288 |
Under heatwave conditions, no significant temporal changes were detected in either species (Caulerpa: p = 0.875; Cymodocea: p = 0.288). These results indicate that, while the figure suggests a slight T1→T2 softening in Cymodocea under heatwave exposure, the magnitude of the shift was insufficient to reach statistical significance. Combined with the control analysis, this supports the view that Shannon diversity remained broadly stable over time in both species, with only modest, non-significant fluctuations under heat stress.
To probe short-interval changes under control conditions, we compared Shannon diversity between T1 and T2 for Caulerpa using descriptive summaries and a Wilcoxon test as a robust alternative to the Welch t-test.
target_var <- "shanon"
dat <- datos %>%
filter(Species == "CAU", Treatment == "Control", Time %in% c("T1","T2")) %>%
select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
filter(!is.na(value))
summary_tbl <- dat %>%
group_by(Time) %>%
summarise(
n = dplyr::n(),
mean = mean(value),
sd = sd(value),
se = sd/sqrt(n),
.groups = "drop"
)
knitr::kable(summary_tbl, digits = 3,
caption = "Caulerpa — Control: comparison of Shannon diversity between T1 and T2 (mean, SD, SE, n)")
| Time | n | mean | sd | se |
|---|---|---|---|---|
| T1 | 3 | 5.202 | 0.075 | 0.043 |
| T2 | 3 | 5.195 | 0.047 | 0.027 |
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
##
## Welch Two Sample t-test
##
## data: value by Time
## t = 0.13793, df = 3.353, p-value = 0.8982
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
## -0.1453022 0.1593022
## sample estimates:
## mean in group T1 mean in group T2
## 5.201667 5.194667
w_test <- stats::wilcox.test(value ~ Time, data = dat, exact = FALSE)
w_test
##
## Wilcoxon rank sum test with continuity correction
##
## data: value by Time
## W = 5, p-value = 1
## alternative hypothesis: true location shift is not equal to 0
The summary table shows nearly identical means at T1 and T2 (5.202 vs 5.195), with small standard errors. Welch’s test confirmed no evidence of a difference (t = 0.138, df = 3.35, p = 0.898; 95% CI for the mean difference: −0.145 to 0.159). Results were consistent with the Welch test, showing no evidence of a difference between time points (Wilcoxon p = 1). Given the very small sample size (n = 3 per time), these findings should be interpreted as compatible with negligible change rather than proof of no effect. Thus, Shannon diversity in Caulerpa remained stable between T1 and T2 under control conditions.
To check short-interval changes under heatwave conditions, we compared Shannon diversity between T1 and T2 for Caulerpa using descriptive summaries and a Wilcoxon test as a robust alternative to the Welch t-test.
dat <- datos %>%
filter(Species == "CAU", Treatment == "HW", Time %in% c("T1","T2")) %>%
select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
filter(!is.na(value))
summary_tbl <- dat %>%
group_by(Time) %>%
summarise(
n = dplyr::n(),
mean = mean(value),
sd = sd(value),
se = sd/sqrt(n),
.groups = "drop"
)
knitr::kable(summary_tbl, digits = 3,
caption = "Caulerpa – HW: T1 vs T2 (mean, SD, n)")
| Time | n | mean | sd | se |
|---|---|---|---|---|
| T1 | 3 | 5.202 | 0.075 | 0.043 |
| T2 | 3 | 5.180 | 0.139 | 0.080 |
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
##
## Welch Two Sample t-test
##
## data: value by Time
## t = 0.23467, df = 3.0672, p-value = 0.8293
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
## -0.2644285 0.3070952
## sample estimates:
## mean in group T1 mean in group T2
## 5.201667 5.180333
w_test <- stats::wilcox.test(value ~ Time, data = dat, exact = FALSE)
w_test
##
## Wilcoxon rank sum test with continuity correction
##
## data: value by Time
## W = 4, p-value = 1
## alternative hypothesis: true location shift is not equal to 0
The summary table shows very similar means at T1 and T2 (5.202 vs 5.180) with small standard errors. Both Welch’s test (t = 0.235, df = 3.07, p = 0.829; 95% CI for the mean difference: –0.264 to 0.308) and the Wilcoxon rank-sum test (W = 4, p = 1.00) indicated no evidence of a difference between the two time points. Thus, under heatwave conditions, Caulerpa diversity remained essentially stable between T1 and T2. Given the small sample size (n = 3 per time), these results should be interpreted as consistent with a negligible effect rather than proof of no effect.
dat <- datos %>%
filter(Species == "CYM", Treatment == "Control", Time %in% c("T1","T2")) %>%
select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
filter(!is.na(value))
summary_tbl <- dat %>%
group_by(Time) %>%
summarise(
n = dplyr::n(),
mean = mean(value),
sd = sd(value),
se = sd/sqrt(n),
.groups = "drop"
)
knitr::kable(summary_tbl, digits = 3,
caption = "Cymodocea – Control: T1 vs T2 (mean, SD, n)")
| Time | n | mean | sd | se |
|---|---|---|---|---|
| T1 | 3 | 5.073 | 0.165 | 0.095 |
| T2 | 3 | 5.084 | 0.203 | 0.117 |
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
##
## Welch Two Sample t-test
##
## data: value by Time
## t = -0.077316, df = 3.8365, p-value = 0.9422
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
## -0.4377554 0.4144221
## sample estimates:
## mean in group T1 mean in group T2
## 5.072667 5.084333
w_test <- stats::wilcox.test(value ~ Time, data = dat, exact = FALSE)
w_test
##
## Wilcoxon rank sum test with continuity correction
##
## data: value by Time
## W = 4.5, p-value = 1
## alternative hypothesis: true location shift is not equal to 0
The summary statistics indicate that Cymodocea maintained very similar Shannon diversity values between T1 (mean = 5.073, SE = 0.095) and T2 (mean = 5.084, SE = 0.117). Both Welch’s two-sample t-test (t = –0.077, df = 3.84, p = 0.942) and the Wilcoxon rank-sum test (W = 5, p = 1.00) confirmed the absence of significant differences between the two time points. These findings suggest that under control conditions, Cymodocea diversity remained stable across the short interval from T1 to T2, with no detectable changes despite minor variation in the group means.
dat <- datos %>%
filter(Species == "CYM", Treatment == "HW", Time %in% c("T1","T2")) %>%
select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
filter(!is.na(value))
summary_tbl <- dat %>%
group_by(Time) %>%
summarise(
n = dplyr::n(),
mean = mean(value),
sd = sd(value),
se = sd/sqrt(n),
.groups = "drop"
)
knitr::kable(summary_tbl, digits = 3,
caption = "Cymodocea – HW: T1 vs T2 (mean, SD, n)")
| Time | n | mean | sd | se |
|---|---|---|---|---|
| T1 | 3 | 5.073 | 0.165 | 0.095 |
| T2 | 3 | 5.056 | 0.106 | 0.061 |
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
##
## Welch Two Sample t-test
##
## data: value by Time
## t = 0.14762, df = 3.4062, p-value = 0.891
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
## -0.3195780 0.3529113
## sample estimates:
## mean in group T1 mean in group T2
## 5.072667 5.056000
w_test <- stats::wilcox.test(value ~ Time, data = dat, exact = FALSE)
w_test
##
## Wilcoxon rank sum test with continuity correction
##
## data: value by Time
## W = 4, p-value = 1
## alternative hypothesis: true location shift is not equal to 0
The descriptive statistics for Cymodocea under heatwave conditions show nearly identical Shannon diversity means at T1 (5.073, SE = 0.095) and T2 (5.056, SE = 0.061). Welch’s two-sample t-test found no evidence of a difference (t = 0.148, df = 3.41, p = 0.891; 95% CI: –0.320 to 0.352), and the Wilcoxon rank-sum test likewise indicated no shift in location (W = 5, p = 1.00).
Datos.Muestras=data.frame(read_excel("TablaOTUs.xlsx"))
Samples.original=Datos.Muestras$ID
DF.0.original=Datos.Muestras[,-c(1,2)]
row.names(DF.0.original)=Samples.original
OTUs=paste("OTU",1:dim(DF.0.original)[2],sep="")
names(DF.0.original)=OTUs
Sizes=rowSums(DF.0.original)
Tipos.original=as.factor(rep(c("CAU_T0","CAU_T1","CAU_T2C", "CAU_T2HW","CYM_T0","CYM_T1", "CYM_T2C","CYM_T2HW"),each=3))
We have quantified the biodiversity of every type of samples CAU_T* or CYM_T* by means of the Shannon index of the sample obtained by merging the 3 samples of that type. To test which pairs of sample types had significantly different Shannon indices, we have used the Hutcheson t-test, as implemented in function Hutcheson_t_test of the R package ecolTest, and adjusted the p-values with the Holm method.
# Shannon index
SH=function(x){
vegan::diversity(x,index = "shannon")
}
p.lnp2=function(p){
if (p==0){return(0)}
else {
return(p*(log(p)^2))
}
}
# Hutcheson variance estimator
Var.SH=function(x){
N=sum(x)
y=sapply(x/sum(x),FUN=p.lnp2)
bit=sapply(x,FUN=function(x){min(x,1)})
S=sum(bit)
vv=(sum(y)-SH(x)^2)/N+(S-1)/(2*N^2)
return(vv)
}
# Standard deviation estimator
sd.SH=function(x){
sqrt(Var.SH(x))
}
Samples=Samples.original
DF.0=DF.0.original
Tipos=Tipos.original
DF.0.agrup=aggregate(DF.0,by=list(Tipos),FUN=sum)[,-1]
For each sample, we compute its Shannon index and an estimation of its standard deviation (sd):
Resultados=data.frame(Samples,
round(apply(DF.0,MARGIN=1,SH),4),
round(apply(DF.0,MARGIN=1,sd.SH),4))
row.names(Resultados)=NULL
names(Resultados)=c("Samples","SH","sd")
flextable(Resultados)|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Samples | SH | sd |
|---|---|---|
CAU_T0_1 | 5.2297 | 0.0099 |
CAU_T0_2 | 5.0583 | 0.0083 |
CAU_T0_3 | 5.1884 | 0.0108 |
CAU_T1_1 | 5.2781 | 0.0098 |
CAU_T1_2 | 5.1297 | 0.0099 |
CAU_T1_3 | 5.1981 | 0.0104 |
CAU_T2C_1 | 5.1477 | 0.0102 |
CAU_T2C_2 | 5.1980 | 0.0096 |
CAU_T2C_3 | 5.2405 | 0.0109 |
CAU_T2HW_1 | 5.2974 | 0.0072 |
CAU_T2HW_2 | 5.0278 | 0.0085 |
CAU_T2HW_3 | 5.2167 | 0.0078 |
CYM_T0_1 | 4.8018 | 0.0083 |
CYM_T0_2 | 4.9936 | 0.0098 |
CYM_T0_3 | 4.9272 | 0.0081 |
CYM_T1_1 | 4.9171 | 0.0052 |
CYM_T1_2 | 5.0564 | 0.0056 |
CYM_T1_3 | 5.2460 | 0.0065 |
CYM_T2C_1 | 5.3003 | 0.0063 |
CYM_T2C_2 | 5.0557 | 0.0063 |
CYM_T2C_3 | 4.8975 | 0.0066 |
CYM_T2HW_1 | 4.9353 | 0.0067 |
CYM_T2HW_2 | 5.1045 | 0.0064 |
CYM_T2HW_3 | 5.1299 | 0.0066 |
The next graphic depicts, for each sample, its Shannon index \(SH\) and the error bar \(SH\pm 2\cdot sd\).
row.names(Resultados)=Samples
plot(0.1*(1:24),Resultados$SH,
col=rep(c(rep("green",3),rep("blue",3),rep("purple",3),rep("red",3)),2),
pch=c(rep(20,12),rep(18,12)),cex=1,
xlab="",ylab="Shannon index",xaxt='n',
main="All samples")
legend("bottomleft",col=c("black","black","green","blue","purple","red"),
pch=c(20,18,NA,NA,NA,NA),
lty=c(NA,NA,1,1,1,1),
legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
seg.len=0.5,cex=0.5)
segments(0.1*(1:24),Resultados$SH-2*Resultados$sd,
0.1*(1:24),Resultados$SH+2*Resultados$sd,
col=rep(c(rep("green",3),rep("blue",3),rep("purple",3),rep("red",3)),2),lwd=1.5)
We repeat the process for sample types: for each sample type, we merge all three samples of that type into a single sample. We include the intervals (SH-3·sd,SH+3·sd). The reason for the factor 3 is that there are 36 pairs of samples to compare, and hence we use as factor for sd the 0.951/36 quantile of the standard normal distribution. In this way, disjoint intervals give statistical evidence (at the 5% signification level) that the corresponding two indices are different.
Resultados.agrupados=data.frame(levels(Tipos),
round(apply(DF.0.agrup,MARGIN=1,SH),4),
round(apply(DF.0.agrup,MARGIN=1,sd.SH),4),
round(apply(DF.0.agrup,MARGIN=1,SH)-3*apply(DF.0.agrup,MARGIN=1,sd.SH),4),
round(apply(DF.0.agrup,MARGIN=1,SH)+3*apply(DF.0.agrup,MARGIN=1,sd.SH),4)
)
row.names(Resultados.agrupados)=NULL
names(Resultados.agrupados)=c("Samples","SH","sd","SH-3sd","SH+3sd")
flextable(Resultados.agrupados)|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Samples | SH | sd | SH-3sd | SH+3sd |
|---|---|---|---|---|
CAU_T0 | 5.2009 | 0.0056 | 5.1842 | 5.2176 |
CAU_T1 | 5.2607 | 0.0059 | 5.2428 | 5.2785 |
CAU_T2C | 5.2590 | 0.0060 | 5.2410 | 5.2770 |
CAU_T2HW | 5.2457 | 0.0046 | 5.2319 | 5.2594 |
CYM_T0 | 4.9321 | 0.0051 | 4.9168 | 4.9474 |
CYM_T1 | 5.1190 | 0.0034 | 5.1089 | 5.1291 |
CYM_T2C | 5.1226 | 0.0038 | 5.1113 | 5.1338 |
CYM_T2HW | 5.0875 | 0.0039 | 5.0759 | 5.0991 |
plot(0.1*(1:8),Resultados.agrupados$SH,col=rep(c("green","blue","purple","red"),2),
pch=c(rep(20,4),rep(18,4)),
xlab="",ylab="Shannon index",xaxt='n',
main="All sample types")
segments(0.1*(1:8),Resultados.agrupados$SH-3*Resultados.agrupados$sd,
0.1*(1:8),Resultados.agrupados$SH+3*Resultados.agrupados$sd,
col=rep(c("green","blue","purple","red"),2),lwd=1.5)
legend("bottomleft",col=c("black","black","green","blue","purple","red"),
pch=c(20,18,NA,NA,NA,NA),
lty=c(NA,NA,1,1,1,1),
legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
seg.len=0.5,cex=0.5)
SH indices of CAU samples are significantly different from (and actually significantly larger than) those of CYM samples.
plot(0.1*(1:4),Resultados.agrupados$SH[1:4],col=c("green","blue","purple","red"),
pch=20,main="CAU sample types",
xlab="",ylab="Shannon index",xaxt='n',
ylim=c(min(Resultados.agrupados$SH[1:4]-3*Resultados.agrupados$sd[1:4]),
max(Resultados.agrupados$SH[1:4]+3*Resultados.agrupados$sd[1:4])))
segments(0.1*(1:4),Resultados.agrupados$SH[1:4]-3*Resultados.agrupados$sd[1:4],
0.1*(1:4),Resultados.agrupados$SH[1:4]+3*Resultados.agrupados$sd[1:4],
col=c("green","blue","purple","red"),lwd=1.5)
legend("topright",col=c("green","blue","purple","red"),
pch=20,legend=c("T0","T1","T2C","T2HW"),seg.len=0.5,cex=0.5)
Only the SH index of T0 samples seems to be significantly different from the others.
plot(0.1*(1:4),Resultados.agrupados$SH[5:8],col=c("green","blue","purple","red"),
pch=18,main="CYM sample types",
xlab="",ylab="Shannon index",xaxt='n', ylim=c(min(Resultados.agrupados$SH[5:8]-3*Resultados.agrupados$sd[5:8]),max(Resultados.agrupados$SH[5:8]+3*Resultados.agrupados$sd[5:8])))
segments(0.1*(1:4),Resultados.agrupados$SH[5:8]-3*Resultados.agrupados$sd[5:8],
0.1*(1:4),Resultados.agrupados$SH[5:8]+3*Resultados.agrupados$sd[5:8],
col=c("green","blue","purple","red"),lwd=1.5)
legend("topleft",col=c("green","blue","purple","red"),
pch=18,legend=c("T0","T1","T2C","T2HW"),seg.len=0.5,cex=0.5)
All four SH indices are significantly different, except for T1 and T2C.
We can use Hutcheson t-test to decide whether Shannon indices are significantly different. We perform pairwise comparisons and adjust the p-values using the Holm method. The conclusions are the same.
pvalSH.G=matrix(NA,nrow=8,ncol=8)
for (i in 1:7){
for (j in (i+1):8){
pvalSH.G[i,j]= Hutcheson_t_test(DF.0.agrup[i,],DF.0.agrup[j,])$p.value
}
}
pvalSH.G=matrix(p.adjust(pvalSH.G,method="holm"),nrow=8)
colnames(pvalSH.G)=levels(Tipos)
row.names(pvalSH.G)=levels(Tipos)
flextable(data.frame(round(pvalSH.G,6)))|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
CAU_T0 | CAU_T1 | CAU_T2C | CAU_T2HW | CYM_T0 | CYM_T1 | CYM_T2C | CYM_T2HW |
|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.000000 | 0 | 0 | 0.000000 | 0 | |
0.950346 | 0.179925 | 0 | 0 | 0.000000 | 0 | ||
0.228577 | 0 | 0 | 0.000000 | 0 | |||
0 | 0 | 0.000000 | 0 | ||||
0 | 0.000000 | 0 | |||||
0.950346 | 0 | ||||||
0 | |||||||
We address the following question: If we set a threshold \(p_0\), we only retain from each sample the OTUs that appear with a relative frequency \(\geqslant p_0\) and calculate the Shannon index (SH) of those samples, how does its longitudinal behavior vary from T0->T1->T2C->T2HW as \(p_0\) increases?
We use the following method to answer this question:
We obtain that, in the CAU samples, at some point between the 0.15% and 0.2% thresholds, and in the CYM samples, at some point between the 0.1% and 0.15% thresholds, the longitudinal behavior of the SH changes for the first time from that observed in the whole samples, shifting from a rise-fall-fall pattern to a rise-fall-rise pattern.
For each type of CAU sample, around 12-13% of OTUs present in at least one sample of this type have relative frequency in all three samples of that type greater than or equal to 0.1%. For CYM sample types, this figure descends to 9-10%.
# threshold
llindar=function(x,l){
if (x>=l){return(1)}
else{return(0)}
}
#Global sample
DF.0=as.matrix(DF.0)
DF.0.prop=DF.0/rowSums(DF.0)
DF.0.agrup=as.matrix(DF.0.agrup)
DF.0.agrup.prop=DF.0.agrup/rowSums(DF.0.agrup)
Original sample sizes and range of proportions within them:
Tamaños=rowSums(DF.0)
Props.min=round(apply(DF.0.prop,MARGIN=1,function(x){min(x[x>0])}),6)
Props.max=apply(DF.0.prop,MARGIN=1,function(x){max(x[x>0])})
Info=data.frame(Tamaños,Props.min,Props.max)
names(Info)=c("Size","Smallest prop.","Largest prop.")
flextable(Info)|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Size | Smallest prop. | Largest prop. |
|---|---|---|
32452 | 0.000031 | 0.05805497 |
45161 | 0.000022 | 0.05832466 |
26298 | 0.000038 | 0.05354019 |
31355 | 0.000032 | 0.05243183 |
30592 | 0.000033 | 0.05458944 |
29368 | 0.000034 | 0.07147235 |
30554 | 0.000033 | 0.05194083 |
33119 | 0.000030 | 0.05694616 |
26568 | 0.000038 | 0.06884222 |
62616 | 0.000016 | 0.06188514 |
43726 | 0.000023 | 0.06026163 |
50715 | 0.000020 | 0.05223307 |
57062 | 0.000018 | 0.10017174 |
38684 | 0.000026 | 0.09365629 |
56482 | 0.000018 | 0.09686272 |
145523 | 0.000007 | 0.11633900 |
122687 | 0.000008 | 0.08515980 |
82489 | 0.000012 | 0.08112597 |
91168 | 0.000011 | 0.06932257 |
97111 | 0.000010 | 0.09009278 |
97125 | 0.000010 | 0.12887516 |
88157 | 0.000011 | 0.14189457 |
94560 | 0.000011 | 0.11748096 |
89922 | 0.000011 | 0.11695692 |
Resultados.Agrupados=data.frame(
SH=apply(DF.0.agrup,MARGIN=1,SH),
Varianza=apply(DF.0.agrup,MARGIN=1,Var.SH))
##
plot(0.1*(1:8),Resultados.Agrupados$SH,col=rep(c("green","blue","purple","red"),2),
pch=c(rep(20,4),rep(18,4)),
xlab="",ylab="SH",xaxt='n',
main="Grouped samples")
segments(0.1*(1:8),Resultados.Agrupados$SH-3*sqrt(Resultados.Agrupados$Varianza),
0.1*(1:8),Resultados.Agrupados$SH+3*sqrt(Resultados.Agrupados$Varianza),
col=rep(c("green","blue","purple","red"),2),lwd=1.5)
legend("topright",col=c("black","black","green","blue","purple","red"),
pch=c(20,18,NA,NA,NA,NA),
lty=c(NA,NA,1,1,1,1),
legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
seg.len=0.5,cex=0.5)
#
plot(0.1*(1:4),Resultados.Agrupados$SH[1:4],col=c("green","blue","purple","red"),
pch=20,main="Grouped samples: CAU",
xlab="",ylab="SH",xaxt='n',
ylim=c(min(Resultados.Agrupados$SH[1:4]-3*sqrt(Resultados.Agrupados$Varianza[1:4])),
max(Resultados.Agrupados$SH[1:4]+3*sqrt(Resultados.Agrupados$Varianza[1:4]))))
segments(0.1*(1:4),Resultados.Agrupados$SH[1:4]-3*sqrt(Resultados.Agrupados$Varianza[1:4]),
0.1*(1:4),Resultados.Agrupados$SH[1:4]+3*sqrt(Resultados.Agrupados$Varianza[1:4]),
col=c("green","blue","purple","red"),lwd=1.5)
legend("topleft",col=c("green","blue","purple","red"),
pch=20,legend=c("T0","T1","T2C","T2HW"),cex=0.5)
#
plot(0.1*(1:4),Resultados.Agrupados$SH[5:8],col=c("green","blue","purple","red"),
pch=18,main="Grouped samples: CYM",
xlab="",ylab="SH",xaxt='n', ylim=c(min(Resultados.Agrupados$SH[5:8]-3*sqrt(Resultados.Agrupados$Varianza[5:8])),max(Resultados.Agrupados$SH[5:8]+3*sqrt(Resultados.Agrupados$Varianza[5:8]))))
segments(0.1*(1:4),Resultados.Agrupados$SH[5:8]-3*sqrt(Resultados.Agrupados$Varianza[5:8]),
0.1*(1:4),Resultados.Agrupados$SH[5:8]+3*sqrt(Resultados.Agrupados$Varianza[5:8]),
col=c("green","blue","purple","red"),lwd=1.5)
legend("topleft",col=c("green","blue","purple","red"),
pch=18,legend=c("T0","T1","T2C","T2HW"),cex=0.5)
Resultados=c()
for (i in 1:1000){
print(i)
i0=i*0.00005
llindar.0=function(x){llindar(x,i0)}
DD=vapply(DF.0.prop, llindar.0, numeric(1))*DF.0
DD.agrup=aggregate(DD,by=list(Tipos),FUN=sum)[,-1]
Resultados=rbind(Resultados,
c(apply(DD.agrup,MARGIN=1,SH),#Shannon
apply(DD.agrup,MARGIN=1,sd.SH),#Variança
apply(DD.agrup,MARGIN=1,SH)-3*apply(DD.agrup,MARGIN=1,sd.SH), #LE de l'IC
apply(DD.agrup,MARGIN=1,SH)+3*apply(DD.agrup,MARGIN=1,sd.SH),#UE de l'IC
i0 #llindar
))
}
Resultados=data.frame(Resultados)
names(Resultados)=c(paste("SH",levels(Tipos),sep="."),
paste("sd",levels(Tipos),sep="."),
paste("LE",levels(Tipos),sep="."),
paste("UE",levels(Tipos),sep="."),
"threshold")
saveRDS(Resultados, file="Resultados.Shannon.Umbrales.RData")
p.valores=c()
for (i in 1:1000){
print(i)
i0=i*0.00005
llindar.0=function(x){llindar(x,i0)}
DD=vapply(DF.0.prop, llindar.0, numeric(1))*DF.0
DD.agrup=aggregate(DD,by=list(Tipos),FUN=sum)[,-1]
pvalSH.G=matrix(NA,nrow=8,ncol=8)
for (i in 1:7){
for (j in (i+1):8){
pvalSH.G[i,j]= Hutcheson_t_test(DD.agrup[i,],DD.agrup[j,])$p.value
}
}
pvalSH.G=p.adjust(pvalSH.G,method="holm")
p.valores=rbind(p.valores,
c(i0,
pvalSH.G))
}
write.csv(p.valores, file="p.valores.Shannon.Umbrales.csv",row.names=FALSE)
The results are in the file “Resultados.Shannon.Umbrales.RData”. Its variables are (for each type of sample: CAU_T0, CAU_T1 etc.):
SH.type: The Shannon index (SH) of the union of the three samples of that type
sd.type: The estimated standard deviation of the SH of the union of the three samples of that type
LE.type: SH minus 3 times the estimated standard deviation
UE.type: SH plus 3 times the estimated standard deviation
Threshold: The corresponding threshold
We show its first 10 rows (rounded, and splitted into different tables, for horizontal space reasons), corresponding to thresholds from 0.00005 to 0.0005. :
Resultados=readRDS("Resultados.Shannon.Umbrales.RData")
names(Resultados)=c(paste("SH",levels(Tipos),sep="."),
paste("sd",levels(Tipos),sep="."),
paste("LE",levels(Tipos),sep="."),
paste("UE",levels(Tipos),sep="."),
"Threshold")
Resultados.r4=Resultados
Resultados.r4[,1:32]=round(Resultados.r4[,1:32],4)
flextable(Resultados.r4[1:10,c(33,1+8*(0:3),2+8*(0:3))])|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Threshold | SH.CAU_T0 | sd.CAU_T0 | LE.CAU_T0 | UE.CAU_T0 | SH.CAU_T1 | sd.CAU_T1 | LE.CAU_T1 | UE.CAU_T1 |
|---|---|---|---|---|---|---|---|---|
0.00005 | 5.1252 | 0.0053 | 5.1092 | 5.1412 | 5.1997 | 0.0058 | 5.1824 | 5.2170 |
0.00010 | 5.0481 | 0.0051 | 5.0326 | 5.0635 | 5.1092 | 0.0055 | 5.0926 | 5.1257 |
0.00015 | 5.0011 | 0.0050 | 4.9859 | 5.0162 | 5.0479 | 0.0054 | 5.0318 | 5.0641 |
0.00020 | 4.9245 | 0.0049 | 4.9098 | 4.9391 | 4.9787 | 0.0052 | 4.9630 | 4.9944 |
0.00025 | 4.8751 | 0.0048 | 4.8607 | 4.8895 | 4.9337 | 0.0052 | 4.9182 | 4.9491 |
0.00030 | 4.8395 | 0.0047 | 4.8253 | 4.8537 | 4.8735 | 0.0050 | 4.8584 | 4.8887 |
0.00035 | 4.7807 | 0.0046 | 4.7668 | 4.7946 | 4.8427 | 0.0050 | 4.8277 | 4.8577 |
0.00040 | 4.7456 | 0.0046 | 4.7319 | 4.7594 | 4.8042 | 0.0049 | 4.7893 | 4.8190 |
0.00045 | 4.7104 | 0.0045 | 4.6968 | 4.7239 | 4.7588 | 0.0049 | 4.7442 | 4.7733 |
0.00050 | 4.6612 | 0.0044 | 4.6479 | 4.6745 | 4.7291 | 0.0048 | 4.7146 | 4.7435 |
flextable(Resultados.r4[1:10,c(33,3+8*(0:3),4+8*(0:3))])|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Threshold | SH.CAU_T2C | sd.CAU_T2C | LE.CAU_T2C | UE.CAU_T2C | SH.CAU_T2HW | sd.CAU_T2HW | LE.CAU_T2HW | UE.CAU_T2HW |
|---|---|---|---|---|---|---|---|---|
0.00005 | 5.1943 | 0.0058 | 5.1769 | 5.2117 | 5.1539 | 0.0043 | 5.1408 | 5.1669 |
0.00010 | 5.1027 | 0.0056 | 5.0860 | 5.1194 | 5.0714 | 0.0042 | 5.0588 | 5.0839 |
0.00015 | 5.0536 | 0.0055 | 5.0372 | 5.0699 | 5.0186 | 0.0041 | 5.0063 | 5.0309 |
0.00020 | 4.9697 | 0.0053 | 4.9539 | 4.9855 | 4.9529 | 0.0040 | 4.9409 | 4.9649 |
0.00025 | 4.9234 | 0.0052 | 4.9078 | 4.9389 | 4.9069 | 0.0039 | 4.8952 | 4.9187 |
0.00030 | 4.8896 | 0.0051 | 4.8743 | 4.9050 | 4.8537 | 0.0038 | 4.8422 | 4.8653 |
0.00035 | 4.8425 | 0.0050 | 4.8274 | 4.8576 | 4.8159 | 0.0038 | 4.8045 | 4.8274 |
0.00040 | 4.8079 | 0.0050 | 4.7929 | 4.8228 | 4.7660 | 0.0037 | 4.7548 | 4.7772 |
0.00045 | 4.7851 | 0.0050 | 4.7702 | 4.7999 | 4.7365 | 0.0037 | 4.7254 | 4.7476 |
0.00050 | 4.7406 | 0.0049 | 4.7259 | 4.7553 | 4.7040 | 0.0037 | 4.6930 | 4.7150 |
flextable(Resultados.r4[1:10,c(33,5+8*(0:3),6+8*(0:3))])|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Threshold | SH.CYM_T0 | sd.CYM_T0 | LE.CYM_T0 | UE.CYM_T0 | SH.CYM_T1 | sd.CYM_T1 | LE.CYM_T1 | UE.CYM_T1 |
|---|---|---|---|---|---|---|---|---|
0.00005 | 4.8502 | 0.0049 | 4.8355 | 4.8649 | 5.0004 | 0.0032 | 4.9909 | 5.0100 |
0.00010 | 4.7581 | 0.0047 | 4.7440 | 4.7722 | 4.9137 | 0.0031 | 4.9045 | 4.9229 |
0.00015 | 4.6853 | 0.0046 | 4.6717 | 4.6990 | 4.8593 | 0.0030 | 4.8503 | 4.8684 |
0.00020 | 4.6284 | 0.0045 | 4.6149 | 4.6418 | 4.7987 | 0.0030 | 4.7898 | 4.8075 |
0.00025 | 4.5794 | 0.0044 | 4.5662 | 4.5927 | 4.7475 | 0.0029 | 4.7387 | 4.7562 |
0.00030 | 4.5353 | 0.0044 | 4.5222 | 4.5483 | 4.7035 | 0.0029 | 4.6948 | 4.7121 |
0.00035 | 4.5003 | 0.0043 | 4.4874 | 4.5132 | 4.6659 | 0.0029 | 4.6573 | 4.6745 |
0.00040 | 4.4627 | 0.0043 | 4.4499 | 4.4754 | 4.6107 | 0.0028 | 4.6023 | 4.6191 |
0.00045 | 4.4162 | 0.0042 | 4.4036 | 4.4288 | 4.5693 | 0.0028 | 4.5609 | 4.5776 |
0.00050 | 4.3812 | 0.0042 | 4.3687 | 4.3937 | 4.5383 | 0.0028 | 4.5300 | 4.5466 |
flextable(Resultados.r4[1:10,c(33,7+8*(0:3),8+8*(0:3))])|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Threshold | SH.CYM_T2C | sd.CYM_T2C | LE.CYM_T2C | UE.CYM_T2C | SH.CYM_T2HW | sd.CYM_T2HW | LE.CYM_T2HW | UE.CYM_T2HW |
|---|---|---|---|---|---|---|---|---|
0.00005 | 5.0126 | 0.0036 | 5.0019 | 5.0233 | 4.9765 | 0.0037 | 4.9655 | 4.9875 |
0.00010 | 4.9095 | 0.0034 | 4.8992 | 4.9197 | 4.8922 | 0.0036 | 4.8816 | 4.9029 |
0.00015 | 4.8446 | 0.0033 | 4.8346 | 4.8546 | 4.8148 | 0.0035 | 4.8044 | 4.8252 |
0.00020 | 4.7855 | 0.0033 | 4.7756 | 4.7953 | 4.7597 | 0.0034 | 4.7494 | 4.7699 |
0.00025 | 4.7292 | 0.0032 | 4.7195 | 4.7388 | 4.7049 | 0.0034 | 4.6948 | 4.7150 |
0.00030 | 4.6798 | 0.0032 | 4.6702 | 4.6893 | 4.6562 | 0.0033 | 4.6462 | 4.6661 |
0.00035 | 4.6419 | 0.0032 | 4.6324 | 4.6513 | 4.6095 | 0.0033 | 4.5997 | 4.6193 |
0.00040 | 4.5895 | 0.0031 | 4.5802 | 4.5988 | 4.5639 | 0.0032 | 4.5542 | 4.5736 |
0.00045 | 4.5482 | 0.0031 | 4.5390 | 4.5575 | 4.5322 | 0.0032 | 4.5226 | 4.5418 |
0.00050 | 4.5203 | 0.0030 | 4.5111 | 4.5294 | 4.5011 | 0.0032 | 4.4915 | 4.5106 |
We start by checking several thresholds:
Dibu.global=function(p){
Tall=as.matrix(Resultados[Resultados$Threshold==p,])
plot(0.1*(1:8),Tall[1,1:8],
col=rep(c("green","blue","purple","red"),2),
pch=c(rep(20,4),rep(18,4)),
xlab="",ylab="SH",xaxt='n',main=paste("Threshold",p,sep=" "))
segments(x0=0.1*(1:8),y0=Tall[1,17:24],
x1=0.1*(1:8),y1=Tall[1,25:32],
col=rep(c("green","blue","purple","red"),2),lwd=1.5)
legend("topright",col=c("black","black","green","blue","purple","red"),
pch=c(20,18,NA,NA,NA,NA),
lty=c(NA,NA,1,1,1,1),
legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
seg.len=0.5,cex=0.5)
}
Dibu.global(0.0001)
Dibu.global(0.001)
Dibu.global(0.0025)
Dibu.global(0.005)
Dibu.global("0.0075")
Dibu.global(0.01)
Dibu.global(0.025)
As it can be seen, the longitudinal behavior of Shannon indices changes from p0=0.1% (which is similar to the global behaviour) to 0.25%.
Let us see the graphics for thresholds between 0.001 and 0.005:
for (i in 0:8){
Dibu.global(0.001+0.0005*i)
}
For each type of samples, the next graph represents the proportion of OTUs present in at least one sample of this type whose relative frequency in all three samples of that type is greater than or equal to the percentage indicated on the x-axis. The proportions are also given in a table.
n=seq(from=0.001,to=0.003,by=0.0005)
N=length(n)
Result=n
for (i in 1:8){
X1=DF.0.prop[3*(i-1)+1,]
X1=sort(X1[X1!=0])
X2=DF.0.prop[3*(i-1)+2,]
X2=sort(X2[X2!=0])
X3=DF.0.prop[3*(i-1)+3,]
X3=sort(X3[X3!=0])
PP=rep(0,N)
for(j in 1:N){PP[j]=length(X1[X1>=n[j]] & X2[X2>=n[j]] & X3[X3>=n[j]])/length(X1[X1>0] | X2[X2>0] | X3[X3>0])}
plot(100*n,PP,pch=20,type="l",lwd=1.5,xaxp=c(0.1,0.3,N),yaxp=c(0,0.2,20),xlab="%",ylab="Proportion of OTUs whose percentage is higher than ...",main=Tipos[i])
abline(v=0.01*(0:100),lwd=0.5,col="red")
abline(h=0.01*(0:100),lwd=0.5,col="red")
abline(v=0.15,lwd=1,col="blue")
Result=cbind(Result,round(PP,4))
}
Result=data.frame(Result)
names(Result)=c("Threshold",levels(Tipos))
row.names(Result)=NULL
flextable(Result)|>
fontsize(size = 8, part = "all")|>
width(width = 0.9)
Threshold | CAU_T0 | CAU_T1 | CAU_T2C | CAU_T2HW | CYM_T0 | CYM_T1 | CYM_T2C | CYM_T2HW |
|---|---|---|---|---|---|---|---|---|
0.0010 | 0.1269 | 0.1283 | 0.1394 | 0.1162 | 0.1027 | 0.0896 | 0.0922 | 0.0901 |
0.0015 | 0.0968 | 0.0929 | 0.1004 | 0.0829 | 0.0759 | 0.0662 | 0.0643 | 0.0683 |
0.0020 | 0.0751 | 0.0765 | 0.0797 | 0.0652 | 0.0610 | 0.0508 | 0.0539 | 0.0550 |
0.0025 | 0.0609 | 0.0650 | 0.0632 | 0.0546 | 0.0521 | 0.0388 | 0.0446 | 0.0459 |
0.0030 | 0.0509 | 0.0526 | 0.0563 | 0.0446 | 0.0417 | 0.0342 | 0.0348 | 0.0363 |
According to the Bray-Curtis distance (correctly applied to compositions), the CAU and CYM samples at T0 are more different than at T1, more similar at T1 than at T2C, and again more similar at T2HW than at T2C (or than at T1 or T0, for that matter). Overall, the global sets of CAU and CYM samples are more different than the starting T0 samples, as if part of the original differences were diluted. Are these differences statistically significant?
Well, we have devised a method to assess the statistical significance of the differences in median distances between two pairs of samples groups, based on a resampling procedure. Using this method, we obtain that these convergences and divergences from T0 to T1, from T1 to T2C, from T2C to T2HW, and from T0 to the global sample, are statistically significant.
DF.0.agrup=aggregate(DF.0,by=list(Tipos),FUN=sum)[,-1]
The next table shows, for each level T0, T1, T2C, T2HW, and Global (all samples), the median Bray-Curtis (BC) distances between the compositions of the CAU and CYM samples at those levels.
Times | Average BC-dist. |
|---|---|
T0 | 0.4124 |
T1 | 0.4097 |
T2C | 0.4493 |
T2HW | 0.3766 |
Global | 0.4189 |
The BC-distances decrease from T0 to T1. Then, from T1 to T2C they increase, while from T1 to T2HW they decrease further. Also, from T0 to Global it decreases. Are these “convergences” and “divergences” of samples statistically significant?
We want to perform a contrast with null hypothesis “The CAU_T1 and CYM_T1 samples are as similar as the CAU_T0 and CYM_T0 samples” and alternative hypothesis “The CAU_T1 and CYM_T1 samples are more similar than the CAU_T0 and CYM_T0 samples”.
To solve this contrast, we resort to a resampling process. The broad idea is:
On the one hand, we merge all 3 samples CAU_T0 in a single sample, and all 3 samples CYM_T0 in another single sample, and we repeat 1000 times the following:
In this way, we obtain a vector of 1000 median distances between CAU and CYM for T0
On the other hand, we merge all 6 samples CAU_T0 and CAU_T1 in a single sample, and all 6 samples CYM_T0 and CYM_T1 in another single sample, and we repeat 1000 times the following:
In this way, we obtain a vector of 1000 median distances between CAU and CYM for the union of T0 and T1
The idea is that if the T1 samples are “as similar” as the T0 samples, the median distances between triples of disjoint random samples from the merged T0 samples should be similar to the median distances between triples of disjoint random samples (of the same sizes) from the merged T0+T1 samples. Conversely, if the T1 samples are “more similar” than the T0 samples, the median distances between triples of random samples from the merged T0 samples should tend to be larger than the median distances between triples of random samples (of the same sizes) from the merged T0+T1 samples.
So, we can use as p-value the proportion of pairs (median distance between CAU and CYM for T0, median distance between CAU and CYM for T0+T1) where the first entry is smaller than the second entry. If the T1 samples are “as similar” as the T0 samples, we would expect this to happen half of times, while if the T0 samples are more different than the T1 samples, we would expect this to happen with low frequency.
The problem is that the CYM samples are larger than CAU samples, and it can bias the samples; for instance if we merge CAU_T1 and CYM_T1 samples and take random samples, most of the taxa will come from CYM:
Samples | Sizes |
|---|---|
CAU_T0_1 | 32452 |
CAU_T0_2 | 45161 |
CAU_T0_3 | 26298 |
CAU_T1_1 | 31355 |
CAU_T1_2 | 30592 |
CAU_T1_3 | 29368 |
CYM_T0_1 | 57062 |
CYM_T0_2 | 38684 |
CYM_T0_3 | 56482 |
CYM_T1_1 | 145523 |
CYM_T1_2 | 122687 |
CYM_T1_3 | 82489 |
To solve this drawback, and since we apply BC-distances to proportions, we reescale all samples to size 106 (up to rounding; we need integer frequencies for the simulations) before merging.
DF=round(DF.0.prop*10^6)
DF.prop=DF/rowSums(DF)
DF.agrup=aggregate(DF,by=list(Tipos),FUN=sum)[,-1]
Sizes=rowSums(DF)
The new sample sizes are
Samples | Sizes |
|---|---|
CAU_T0_1 | 1000070 |
CAU_T0_2 | 999907 |
CAU_T0_3 | 999911 |
CAU_T1_1 | 1000070 |
CAU_T1_2 | 1000030 |
CAU_T1_3 | 999912 |
CYM_T0_1 | 1000234 |
CYM_T0_2 | 1000111 |
CYM_T0_3 | 1000043 |
CYM_T1_1 | 1000078 |
CYM_T1_2 | 999876 |
CYM_T1_3 | 999852 |
Let us check first that with these reescaled samples (where, due to rounding, the proportions are not exactly the original ones but very close to them), the behaviour of the distances is the same as the original one:
Times | Average BC-dist. |
|---|---|
T0 | 0.4124 |
T1 | 0.4097 |
T2C | 0.4493 |
T2HW | 0.3766 |
Global | 0.4189 |
No change.
We shall use the following function:
#' II indexes of the CAU grouped samples
#' The first one is the "base" time
#' Example: CAU T0+T1 and CYM T0+T1 vs CAU T0 and CYM T0
#' II=c(1,2)
#' Example: CAU_T0 vs CYM_T0
#' II=c(1)
Simulation=function(II,distancia){
#CAU
XX1=c()
for (i in 1:length(II)){
XX1=c(XX1,rep(OTUs,DF.agrup[II[i],]))
}
XX1=factor(XX1,levels=OTUs)
ind1=1:length(XX1)
#CYM
XX2=c()
for (i in 1:length(II)){
XX2=c(XX2,rep(OTUs,DF.agrup[4+II[i],]))
}
XX2=factor(XX2,levels=OTUs)
ind2=1:length(XX2)
# sizes of base samples
SS=Sizes[c(3*(II[1]-1)+1:3,3*((4+II[1])-1)+1:3)]
ind1.1=sample(ind1,SS[1],replace=FALSE)
ind1.2=sample(ind1[-ind1.1],SS[2],replace=FALSE)
ind1.3=sample(ind1[-c(ind1.1,ind1.2)],SS[3],replace=FALSE)
ind2.1=sample(ind2,SS[4],replace=FALSE)
ind2.2=sample(ind2[-ind2.1],SS[5],replace=FALSE)
ind2.3=sample(ind2[-c(ind2.1,ind1.2)],SS[6],replace=FALSE)
Y1.1=prop.table(table(factor(XX1[ind1.1],levels=OTUs)))
Y1.2=prop.table(table(factor(XX1[ind1.2],levels=OTUs)))
Y1.3=prop.table(table(factor(XX1[ind1.3],levels=OTUs)))
Y2.1=prop.table(table(factor(XX2[ind2.1],levels=OTUs)))
Y2.2=prop.table(table(factor(XX2[ind2.2],levels=OTUs)))
Y2.3=prop.table(table(factor(XX2[ind2.3],levels=OTUs)))
YY=rbind(Y1.1,Y1.2,Y1.3,Y2.1,Y2.2,Y2.3)
BC=c()
for (i in 1:3){
for (j in 4:6){
BC=c(BC,vegan::vegdist(YY[c(i,j),], method=distancia))
}
}
median(BC)
}
For reproducibility, we fix a random seed
#initial_seed=as.integer(Sys.time())
#print(initial_seed)
## [1] 1725986963
#initial_seed%%10000
## [1] 6963
set.seed(6963)
Let us perform the desired contrast for T0 vs T1 and the BC-distance
Sim.T0T1.T0.BC=replicate(1000,Simulation(c(1,2),"bray"))
Sim.T0.BC=replicate(1000,Simulation(c(1),"bray"))
saveRDS(Sim.T0T1.T0.BC, file="Sim.T0T1.T0.BC.RData")
saveRDS(Sim.T0.BC, file="Sim.T0.BC.RData")
The median BC-distances for the random T0+T1 samples go from 0.3877 to 0.3895, while the median BC-distances for the T0 samples go from 0.4057 to 0.407. So, all median BC-distances for T0 are larger than any median distance for T0+T1. Moreover, recall that the median BC-distance between the (reescaled) CAU_T0 and CYM_T0 samples was 0.4124, larger than the largest median BC-distance for the random T0+T1 samples.
For robustness, let us perform the contrast with T1 as base samples
Sim.T0T1.T1.BC=replicate(1000,Simulation(c(2,1),"bray"))
Sim.T1.BC=replicate(1000,Simulation(c(2),"bray"))
saveRDS(Sim.T0T1.T1.BC, file="Sim.T0T1.T1.BC.RData")
saveRDS(Sim.T1.BC, file="Sim.T1.BC.RData")
In this case, the median BC-distances for the T0+T1 samples go from 0.3874 to 0.3894, while the median BC-distances for the T1 samples go from 0.3792 to 0.3807. The median BC-distance between the (reescaled) CAU_T1 and CYM_T1 samples was 0.4097.
Now T1 vs T2C, taking T1 as base samples (for the sake of completeness, we repeat the simulation for T1).
Sim.T1T2C.T1.BC=replicate(1000,Simulation(c(2,3),"bray"))
Sim.T1.BC.2=replicate(1000,Simulation(c(2),"bray"))
saveRDS(Sim.T1T2C.T1.BC, file="Sim.T1T2C.T1.BC.RData")
saveRDS(Sim.T1.BC.2, file="Sim.T1.BC.2.RData")
The median BC-distances for the T1+T2C samples go from 0.3971 to 0.3994, while the median BC-distances for the T1 samples go from 0.3792 to 0.3806. The median BC-distance between the (reescaled) CAU_T1 and CYM_T1 samples was 0.4097.
And T2C vs T2HW, with T2C as base samples.
Sim.T2CT2HW.T2C.BC=replicate(1000,Simulation(c(3,4),"bray"))
Sim.T2C.BC=replicate(1000,Simulation(c(3),"bray"))
saveRDS(Sim.T2CT2HW.T2C.BC,file="Sim.T2CT2HW.T2C.BC.RData")
saveRDS(Sim.T2C.BC, file="Sim.T2C.BC.RData")
The median BC-distances for the T2C+T2HW samples go from 0.385 to 0.3869, while the median BC-distances for the T2C samples go from 0.4274 to 0.4288. The median BC-distance between the (reescaled) CAU_T2C and CYM_T2C samples was 0.4493
Finally, the global samples:
Sim.Global.BC=replicate(1000,Simulation(c(1,2,3,4),"bray"))
Sim.T0.BC=replicate(1000,Simulation(c(1),"bray"))
saveRDS(Sim.Global.BC, file="Sim.Global.BC.RData")
saveRDS(Sim.T0.BC, file="Sim.T0.BC.RData")
The median BC-distances bewteen CAU and CYM for the Global samples go from 0.3805 to 0.3828, while the median BC-distances for the T0 samples go from 0.4057 to 0.407. The median BC-distance between the (reescaled) CAU_T0 and CYM_T0 samples was 0.4124
For each pair of sample types considered, we proceed as follows:
We load TablaOTUs.xlsx and retain only the relevant “global” samples (all, only CAU, or only CYM).
We remove OTUs with 2 or fewer reads across the selected samples.
We remove OTUs that appear in only one of the selected samples, in order to apply Bayesian-multiplicative zero imputation.
We perform Bayesian-multiplicative zero imputation on the selected samples.
We retain only the samples corresponding to the time points to be compared.
We assess whether and how well the two types of samples are clearly separated in a hierarchical clustering.
We study how well the two types of samples separate when only a random group of OTUs of fixed size is considered, for different sizes. This provides a reference against which we evaluate the significance of the separation observed with specific groups of OTUs.
To evaluate how a group of taxa separates two groups of samples CAUx and CYMx, we use the index
\[
\frac{\text{distance between CAU${}_x$ and CYM${}_x$}}{\sqrt{(\text{dist. within CAU${}_x$})^2 + (\text{dist. within CYM${}_x$})^2}}
\]
where the distances are calculated as the heights in the hierarchical tree obtained by clustering the samples with the selected taxa.
In each one of these simulations, we fix the random seed as the last four digits of Sys.time() at the moment of execution.
We search for groups of OTUs with significantly different abundances in the two sample types by applying several methods: significance according to ALDeX (adjusted p-values from the Kruskal–Wallis test); relevant log-ratios; bacterial signatures identified with coda_glmnet; and significance according to simper.
When the different methods identify groups of OTUs that consistently separate the two sample types, we record them in a table Significant, and finally we export this table as a csv file. Its name is “OTU_Significativos_” followed by the pair of sample types.
tax_otu=data.frame(read_excel("TablaOTUs.xlsx"))
Samples=tax_otu$ID
n.OTUs.original=dim(tax_otu)[2]-2
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
DF.0=tax_otu[,-c(1,2)]
taxa.original=taxa
OTUs=paste("OTU",1:n.OTUs.original,sep="")
colnames(DF.0)=OTUs
DF.0.total=DF.0
colnames(DF.0.total)=OTUs
rownames(DF.0.total)=Samples
rm(tax_otu)
Type=as.factor(rep(c("CAU","CYM"),each=12))
colors=c("green","blue")[Type]
#' Green: CAU
#' Blue: CYM
The OTUs that appear with 2 or fewer hits in the total sample represent 15.8% of the total.
Miseria=as.vector(which(colSums(DF.0)<=2))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
# DF.0.hits: para cada OTU y cada muestra, 1 si OTU in Muestra y 0 si no
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
OTUs that appear in only one sample but represent more than 0.05% of that sample:
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions |
|---|---|---|---|
| CAU_T0_2 | OTU966 | 55 | 0.0012183 |
We remove the OTUs that appear in only one samples and we perform Bayesian-multiplicative zero imputation.
DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
row.names(DF)=Samples
rm(DF.0.hits)
rm(Miseria)
From 3088 initial taxa, we have reduced to 2542.
HC=ClusterHC(DF,Grups=Type,barplot=FALSE,colores=colors)
separacio.global=Sep(HC)
The two sample types are perfectly separated from the start. The separation between groups is 2.47.
We observe below that the two sample types are so different that almost any subset of OTUs of reasonable size classifies them correctly. This will serve as a reference to compare how much improvement we gain with the “good” sets identified later.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711395644
initial_seed %% 10000
# [1] 5644
set.seed(5644)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:round((dim(DF)[2])/2))
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))
}
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CAUvsCYM.RData")
F.SS=readRDS("Fraccio.Separadors.CAUvsCYM.RData")
The first plot shows, for each \(n\) up to half the total number of OTUs, the proportion of samples of size \(n\) that perfectly classify CAU and CYM (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We have used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",xlim=c(0,1250),xaxp=c(0,1250,25),yaxp=c(0,1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to half the total number of OTUs, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU and CYM (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve). This plot will serve as a reference later.
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,1250),xaxp=c(0,1250,25),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=separacio.global,col="green")
`
We always use the aldexCesc.clr version of the aldex.clr function (from the script funcionsCODAMETACIRCLE.R) to generate the simulated samples.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711411362
initial_seed %% 10000
# [1] 1362
set.seed(1362)
repl.kw.Cescv2.CaCy=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex.kw.Cescv2.CaCy=aldex.kw(repl.kw.Cescv2.CaCy, verbose=FALSE)
#'
saveRDS(aldex.kw.Cescv2.CaCy, file="aldex_kw_Cescv2_CaCy.RData")
p.valores_kw=readRDS("aldex_kw_Cescv2_CaCy.RData")
Adjusted Kruskal–Wallis p-value < 0.05
sep.kw.005=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1)
kw.005=which(p.valores_kw[,2]<0.05)
There are 804 taxa with adjusted Kruskal–Wallis p-value < 0.05.
The separation between groups is 2.48.
Adjusted Kruskal–Wallis p-value < 0.01
sep.kw.001=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.01,1)
kw.001=which(p.valores_kw[,2]<0.01)
There are 605 taxa with adjusted Kruskal–Wallis p-value < 0.01.
The separation between groups is 2.35.
Adjusted Kruskal–Wallis p-value < 0.001
sep.kw.0001=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.001,1)
kw.0001=which(p.valores_kw[,2]<0.001)
There are 408 taxa with adjusted Kruskal–Wallis p-value < 0.001.
The separation between groups is 2.89.
Indicador=function(x,k=n.OTUs.original){
xx=rep(0,k)
y=as.numeric(gsub("\\D+", "", OTUs[x]))
xx[y]=1
return(xx)
}
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Aldex.kw.0.05=Indicador(kw.005),
Aldex.kw.0.01=Indicador(kw.001),
Aldex.kw.0.001=Indicador(kw.0001)
)
We plot the assock values (see the main text) and obtain its maximum.
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CAUCYM.RData")
LRS=readRDS("LRS_CAUCYM.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_CAUCYM.RData")
assoc=readRDS("assoc_CAUCYM.RData")
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most logratio-relevant taxa."
,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
DF.Imp=DF[,impLR]
HC=ClusterHC(DF.Imp,dendrograma=FALSE,barplot=FALSE, Grups=Type)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 103 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 6.58.
Significant=cbind(Significant,
Log_ratio.max=Indicador(impLR))
coda_glmnet)We always apply the coda_glmnet function to the frequency matrix with zeros already imputed.
coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
saveRDS(coda.glmnet, file="coda.glmnet.CAUCYM.RData")
coda.glmnet=readRDS("coda.glmnet.CAUCYM.RData")
coda.glmnet$`signature plot`
impGLMNET=coda.glmnet$taxa.num
DF.Imp=DF[,impGLMNET]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
SB.glmnet=c(length(coda.glmnet$taxa.num),Sep(HC))
There are 118 significant taxa.
The separation between groups is 4.55.
Significant=cbind(Significant,
glmnet_sign.CAU=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`>0]),
glmnet_sign.CYM=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`<0]))
We always use simper from vegan on the (original) proportion matrix.
DF.prop=DF.0.total/rowSums(DF.0.total)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP.rel.prop=SMP.prop$CAU_CYM$ord[which(SMP.prop$CAU_CYM$cusum<=0.7)]
OTUs.simper.rel.prop=sort(attr(SMP.prop$CAU_CYM$cusum[which(SMP.prop$CAU_CYM$cusum<=0.7)],"names"))
DF.Imp=DF[,OTUs.simper.rel.prop]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
simper.rel.props=c(length(indexos_SMP.rel.prop),Sep(HC))
There are 93 significant taxa. They perfectly separate the two sample types. The separation between groups is 2.88.
Indicador.Gl=function(x,k=n.OTUs.original){
xx=rep(0,k)
xx[x]=1
return(xx)
}
Significant=cbind(Significant,
simper.props=Indicador.Gl(indexos_SMP.rel.prop))
Significant=cbind(Significant,
t(DF.prop))
write.csv2(Significant,"OTU_Significativos_CAUvsCYM_def.csv",row.names=FALSE)
Resultados=rbind(
sep.kw.0001,sep.kw.005,sep.kw.001,
LR.max,SB.glmnet,
simper.rel.props
)
row.names(Resultados)=c("Aldex.0.001","Aldex.0.05","Aldex.0.01",
"LR","glmnet", "simper")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xaxp=c(0,1000,40),yaxp=c(0,8.5,17),ylim=c(0,8.5),xlim=c(0,1000))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.1,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet.
Significant.capat=Significant[,c(1,2,6,7,8)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
There are 41 such taxa. They are
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU22 | OTU22 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfatitalea.s__uncultured_Desulfobacteraceae |
| OTU30 | OTU30 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__PHOS.HE36.g__PHOS.HE36.s__uncultured_bacterium |
| OTU32 | OTU32 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.guncultured. |
| OTU35 | OTU35 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__LCP.80.s__uncultured_delta |
| OTU38 | OTU38 | Unassigned.__.....__ |
| OTU39 | OTU39 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__Lentimicrobiaceae.g__Lentimicrobiaceae.s__uncultured_Bacteroidetes |
| OTU42 | OTU42 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.guncultured. |
| OTU45 | OTU45 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.. |
| OTU46 | OTU46 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__Ignavibacteriaceae.gIgnavibacterium. |
| OTU70 | OTU70 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.oDesulfobacterales... |
| OTU109 | OTU109 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gLCP.80. |
| OTU133 | OTU133 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.gDesulfatiglans. |
| OTU141 | OTU141 | d__Bacteria.p__Bdellovibrionota.c__Oligoflexia.o__0319.6G20.f__0319.6G20.g0319.6G20. |
| OTU213 | OTU213 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gDesulfatitalea. |
| OTU218 | OTU218 | d__Bacteria.p__Spirochaetota.c__Leptospirae.o__Leptospirales.f__Leptospiraceae.g__RBG.16.49.21.s__uncultured_organism |
| OTU223 | OTU223 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__ST.12K33.g__ST.12K33.s__uncultured_Cytophagales |
| OTU229 | OTU229 | d__Bacteria.p__Spirochaetota.c__Leptospirae.o__Leptospirales.f__Leptospiraceae.g__RBG.16.49.21.s__uncultured_bacterium |
| OTU230 | OTU230 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.g__uncultured.s__uncultured_Bacteroidetes |
| OTU253 | OTU253 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.g__Desulfatiglans.s__uncultured_delta |
| OTU255 | OTU255 | d__Bacteria.p__Spirochaetota.c__V2072.189E03.o__V2072.189E03.f__V2072.189E03.g__V2072.189E03.s__uncultured_organism |
| OTU256 | OTU256 | d__Bacteria.p__Fermentibacterota.c__Fermentibacteria.o__Fermentibacterales.f__Fermentibacteraceae.g__Fermentibacteraceae.s__uncultured_bacterium |
| OTU269 | OTU269 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.gCandidatus_Thiobios. |
| OTU451 | OTU451 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gActibacter. |
| OTU498 | OTU498 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.g__Desulfatiglans.s__uncultured_organism |
| OTU527 | OTU527 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.g__Desulfotignum.s__uncultured_bacterium |
| OTU761 | OTU761 | d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Microtrichales.f__Microtrichaceae.gSva0996_marine_group. |
| OTU811 | OTU811 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfosarcina.s__uncultured_bacterium |
| OTU881 | OTU881 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.g__uncultured.s__uncultured_gamma |
| OTU890 | OTU890 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__uncultured_Bacteroidetes |
| OTU1107 | OTU1107 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Sva0081_sediment_group.s__metagenome |
| OTU1329 | OTU1329 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfococcaceae.gDesulfonema. |
| OTU1390 | OTU1390 | d__Bacteria.p__Gemmatimonadota.c__PAUC43f_marine_benthic_group.o__PAUC43f_marine_benthic_group.f__PAUC43f_marine_benthic_group.g__PAUC43f_marine_benthic_group.s__saltmarsh_clone |
| OTU1419 | OTU1419 | d__Bacteria.p__NB1.j.c__NB1.j.o__NB1.j.f__NB1.j.g__NB1.j.s__uncultured_bacterium |
| OTU1443 | OTU1443 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__B2M28.f__B2M28.g__B2M28.s__uncultured_proteobacterium |
| OTU1461 | OTU1461 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__Robiginitalea_myxolifaciens |
| OTU1532 | OTU1532 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.g__Candidatus_Thiobios.s__uncultured_organism |
| OTU1594 | OTU1594 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__PHOS.HE36.gPHOS.HE36. |
| OTU1701 | OTU1701 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Methyloligellaceae.gMethyloceanibacter. |
| OTU1839 | OTU1839 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Ectothiorhodospirales.f__Thioalkalispiraceae.g__Thioalkalispira.Sulfurivermis.s__uncultured_Thioalkalispiraceae |
| OTU2338 | OTU2338 | d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Cyanobacteriales.f__Xenococcaceae.gXenococcus_PCC.7305. |
| OTU2371 | OTU2371 | d__Bacteria.p__Firmicutes.c__Clostridia.o__Lachnospirales.f__Lachnospiraceae.gRoseburia. |
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
The separation between groups is 7.86.
We update the plot with the separations:
intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c("Aldex.0.001","Aldex.0.05","Aldex.0.01",
"LR","glmnet", "simper","intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xaxp=c(0,1000,40),yaxp=c(0,8.5,17),ylim=c(0,8.5),xlim=c(0,1000))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.1,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Resultados %>%
kbl() %>%
kable_styling()
| Size | Separation | |
|---|---|---|
| Aldex.0.001 | 408 | 2.889614 |
| Aldex.0.05 | 804 | 2.481799 |
| Aldex.0.01 | 605 | 2.354071 |
| LR | 103 | 6.579296 |
| glmnet | 118 | 4.550477 |
| simper | 93 | 2.881239 |
| intersection | 41 | 7.863271 |
tax_otu=data.frame(read_excel("TablaOTUs.xlsx"))
Samples=tax_otu$ID
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")
DF.0=tax_otu[c(1:3,13:15),-c(1,2)]
row.names(DF.0)=Samples[c(1:3,13:15)]
colnames(DF.0)=OTUs
taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
Miseria=as.vector(which(colSums(DF.0)<=2))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples[c(1:3,13:15)]
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
row.names(DF)=Samples[c(1:3,13:15)]
Type=as.factor(c(rep("CAU_T0", 3),rep("CYM_T0", 3)))
colors=c("green","blue")[Type]
After removing OTUs with 2 or fewer reads across the T0 samples and those appearing in only one of these samples, 1472 taxa remain.
The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions |
|---|---|---|---|
| CAU_T0_2 | OTU966 | 55 | 0.0012208 |
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)
HC=ClusterHC(DF,Grups=Type,barplot=FALSE,colores=colors)
separacio.global=Sep(HC)
The two sample types are perfectly separated from the start. The separation between groups is 2.43.
We examine the proportion of OTU subsets of reasonable size that already classify them correctly.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717567904
initial_seed %% 10000
# [1] 7904
set.seed(7904)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:750)
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))
}
colnames(F.SS)=c("n","proporción","sep. media","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.T0.RData")
F.SS=readRDS("Fraccio.Separadors.T0.RData")
The first plot shows, for each \(n\) up to half the total number of OTUs, the proportion of samples of size \(n\) that perfectly classify CAU_T0 and CYM_T0 (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",xlim=c(0,750),xaxp=c(0,750,15),yaxp=c(0,1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to half the total number of OTUs, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU_T0 and CYM_T0 (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,750),xaxp=c(0,750,15),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=separacio.global,col="green")
`
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717574148
initial_seed %% 10000
# [1] 4148
set.seed(4148)
repl.kw.Cescv2.T0=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex.kw.Cescv2.T0=aldex.kw(repl.kw.Cescv2.T0, verbose=FALSE)
#'
saveRDS(aldex.kw.Cescv2.T0, file="aldex_kw_Cescv2.T0.RData")
Taxa with statistically significant differences:
p.valores_kw=readRDS("aldex_kw_Cescv2.T0.RData")
length(p.valores_kw[p.valores_kw[,2]<0.05,2]) #K-W adjusted p-value < 0.05
## [1] 0
length(p.valores_kw[p.valores_kw[,1]<0.05,2]) #p-value < 0.05
## [1] 121
length(p.valores_kw[p.valores_kw[,1]<0.01,4]) #p-value < 0.01
## [1] 0
There is no OTU with an adjusted Kruskal–Wallis test p-value < 0.05. There are 121 with an unadjusted p-value < 0.05 (and none < 0.01). They classify the samples correctly:
sep.kw.005=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1,BP=FALSE)
kw.005=which(p.valores_kw[,2]<0.001)
The separation between groups is 3.43.
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Aldex.kw.0.05=Indicador(kw.005)
)
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_T0.RData")
LRS=readRDS("LRS_T0.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_T0.RData")
assoc=readRDS("assoc_T0.RData")
plot(5:200,assoc[1:196],type="b",xlab="m",ylab="Bondad mediana de los m más relevantes"
,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
DF.Imp=DF[,impLR]
HC=ClusterHC(DF.Imp,dendrograma=FALSE,barplot=FALSE, Grups=Type)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 549 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.83.
Note In the plot we can observe a very pronounced local maximum at 46. Let us examine it:
m0=4+which.max(assoc[1:100])
impLR.2=sort(as.numeric(LRS$`order of importance`[1:m0]))
DF.Imp.0=DF[,impLR.2]
HC=ClusterHC(DF.Imp.0,dendrograma=TRUE,barplot=FALSE, Grups=Type)
LR.max.2=c(length(impLR.2),Sep(HC))
They also perfectly separate the two sample types. The separation between groups is 4.66. Fewer taxa, but better separation.
Significant=cbind(Significant,
Log_ratio.max=Indicador(impLR),
Log_ratio.max2=Indicador(impLR.2))
coda_glmnet)coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
saveRDS(coda.glmnet, file="coda.glmnet.T0.RData")
coda.glmnet=readRDS("coda.glmnet.T0.RData")
coda.glmnet$`signature plot`
impGLMNET=coda.glmnet$taxa.num
DF.Imp=DF[,impGLMNET]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.no0=c(length(coda.glmnet$taxa.num),Sep(HC))
There are 350 significant taxa.
The separation between groups is 2.78.
Significant=cbind(Significant,
glmnet_sign.no0.CAU=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`>0]),
glmnet_sign.no0.CYM=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`<0]))
Out of curiosity, we use 3 replicates to see what happens.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717687628
initial_seed %% 10000
# [1] 7628
M=3
set.seed(7628)
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(
repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,
repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3
))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(
paste("CAU_T0_1",1:M,sep="_"),
paste("CAU_T0_2",1:M,sep="_"),
paste("CAU_T0_3",1:M,sep="_"),
paste("CYM_T0_1",1:M,sep="_"),
paste("CYM_T0_2",1:M,sep="_"),
paste("CYM_T0_3",1:M,sep="_")
)
mostres_ext=rbind(repls,easyCODA::CLR(DF)$LR)
conds_ext=as.factor(c(rep("CAU", 3*M),rep("CYM", 3*M),rep("CAU", 3),rep("CYM", 3)))
colors_ext=c("green","blue")[conds_ext]
rm(repls)
LogRat_mostres_ext_T0=logratios_matrix_clr(mostres_ext)
saveRDS(LogRat_mostres_ext_T0, file="LogRat_mostres_ext_T0.RData")
LogRat_mostres_ext_T0=readRDS("LogRat_mostres_ext_T0.RData")
LogRat_mostres_ext_T0=list(LogRat_mostres_ext_T0[[1]],LogRat_mostres_ext_T0[[2]])
coda.glmnet_ext=coda_glmnet_lr_bin(LogRat_mostres_ext_T0,conds_ext,taxa,showPlots = FALSE)
coda.glmnet_ext$`signature plot`
impGLMNET_ext=coda.glmnet_ext$taxa.num
DF.Imp=DF[,impGLMNET_ext]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext=c(length(impGLMNET_ext),Sep(HC))
There are 104 significant taxa.
The separation between groups is 4.42.
Significant=cbind(Significant,
glmnet_sign.ext.CAU=Indicador(coda.glmnet_ext$taxa.num[coda.glmnet_ext$`log-contrast coefficients`>0]),
glmnet_sign.ext.CYM=Indicador(coda.glmnet_ext$taxa.num[coda.glmnet_ext$`log-contrast coefficients`<0]))
DF.prop=DF.0.total[c(1:3,13:15),]/rowSums(DF.0.total[c(1:3,13:15),])
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP.07=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.7)]
indexos_SMP.05=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.5)]
length(indexos_SMP.07)
## [1] 0
length(indexos_SMP.05)
## [1] 0
We find no significant taxon.
write.csv2(Significant,"OTU_Significativos_CAUT0vsCYMT0.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
LR.max.2,
sep.kw.005,
glmnet.no0,
glmnet.ext)
row.names(Resultados)=c(
"LR.max",
"LR.max.local",
"Aldex",
"glmnet",
"glmnet.ext"
)
colnames(Resultados)=c("Size","Separation")
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,750),xaxp=c(0,750,15),ylim=c(0,7),yaxp=c(0,7,28))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=separacio.global,col="green")
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.1,col="blue",labels=row.names(Resultados),cex=0.75)
text(20,separacio.global+0.01,col="green",labels="Global",cex=0.75)
Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet.
Significant.capat=Significant[,c(1,2,4,6,7)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
There are 312 taxa. They are:
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| 1 | OTU1 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.guncultured. |
| 2 | OTU2 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Actibacter.s__uncultured_Bacteroidetes |
| 3 | OTU3 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.g__uncultured.s__uncultured_delta |
| 4 | OTU4 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__bacterium_AMSU |
| 7 | OTU7 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.... |
| 8 | OTU8 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.g__uncultured.s__uncultured_organism |
| 9 | OTU9 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gSva0081_sediment_group. |
| 10 | OTU10 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.guncultured. |
| 12 | OTU12 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Bacteroidetes_BD2.2.gBacteroidetes_BD2.2. |
| 13 | OTU13 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.guncultured. |
| 15 | OTU15 | d__Bacteria.p__Firmicutes.c__Clostridia.o__Clostridia.f__Hungateiclostridiaceae.g__uncultured.s__uncultured_bacterium |
| 16 | OTU16 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.. |
| 18 | OTU18 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.g__uncultured.s__uncultured_sediment |
| 19 | OTU19 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.guncultured. |
| 20 | OTU20 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__uncultured_bacterium |
| 22 | OTU22 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfatitalea.s__uncultured_Desulfobacteraceae |
| 23 | OTU23 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__Lentimicrobiaceae.gLentimicrobiaceae. |
| 24 | OTU24 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.oBacteroidales... |
| 25 | OTU25 | d__Archaea.p__Asgardarchaeota.c__Heimdallarchaeia.o__Heimdallarchaeia.f__Heimdallarchaeia.g__Heimdallarchaeia.s__uncultured_archaeon |
| 26 | OTU26 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__SBR1031.f__SBR1031.g__SBR1031.s__uncultured_organism |
| 28 | OTU28 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.g__Candidatus_Thiobios.s__uncultured_sediment |
| 29 | OTU29 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Milano.WF1B.44.f__Milano.WF1B.44.g__Milano.WF1B.44.s__uncultured_gamma |
| 30 | OTU30 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__PHOS.HE36.g__PHOS.HE36.s__uncultured_bacterium |
| 31 | OTU31 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.oEctothiorhodospirales... |
| 32 | OTU32 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.guncultured. |
| 33 | OTU33 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidetes_VC2.1_Bac22.f__Bacteroidetes_VC2.1_Bac22.g__Bacteroidetes_VC2.1_Bac22.s__uncultured_Bacteroidetes |
| 35 | OTU35 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__LCP.80.s__uncultured_delta |
| 36 | OTU36 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Steroidobacterales.f__Woeseiaceae.gWoeseia. |
| 37 | OTU37 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__KI89A_clade.f__KI89A_clade.gKI89A_clade. |
| 38 | OTU38 | Unassigned.__.....__ |
| 39 | OTU39 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__Lentimicrobiaceae.g__Lentimicrobiaceae.s__uncultured_Bacteroidetes |
| 40 | OTU40 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.g__Pelolinea.s__uncultured_bacterium |
| 41 | OTU41 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__B2M28.f__B2M28.gB2M28. |
| 42 | OTU42 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.guncultured. |
| 43 | OTU43 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__uncultured.f__uncultured.g__uncultured.s__uncultured_Thioalkalivibrio |
| 44 | OTU44 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gSEEP.SRB1. |
| 45 | OTU45 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.. |
| 47 | OTU47 | d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.o__Woesearchaeales.f__SCGC_AAA011.D5.g__SCGC_AAA011.D5.s__uncultured_euryarchaeote |
| 49 | OTU49 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Sedimenticolaceae.g__Candidatus_Thiodiazotropha.s__Candidatus_Thiodiazotropha |
| 50 | OTU50 | d__Bacteria.p__Sva0485.c__Sva0485.o__Sva0485.f__Sva0485.gSva0485. |
| 52 | OTU52 | d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Actinomarinales.f__uncultured.guncultured. |
| 53 | OTU53 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__uncultured.f__uncultured.guncultured. |
| 54 | OTU54 | d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Microtrichales.f__Ilumatobacteraceae.gIlumatobacter. |
| 55 | OTU55 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Caldithrix.s__uncultured_bacterium |
| 56 | OTU56 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Arenicellales.f__Arenicellaceae.guncultured. |
| 57 | OTU57 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gEudoraea. |
| 60 | OTU60 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Calditrichaceae.s__uncultured_bacterium |
| 62 | OTU62 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.. |
| 64 | OTU64 | d__Eukaryota...... |
| 69 | OTU69 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Sedimenticolaceae.. |
| 70 | OTU70 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.oDesulfobacterales... |
| 71 | OTU71 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gRobiginitalea. |
| 72 | OTU72 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Calorithrix.s__uncultured_bacterium |
| 73 | OTU73 | d__Bacteria.p__Bacteroidota.c__Kryptonia.o__Kryptoniales.f__BSV26.gBSV26. |
| 77 | OTU77 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__uncultured.f__uncultured.g__uncultured.s__uncultured_Thermoplasmatales |
| 79 | OTU79 | d__Bacteria.p__Acidobacteriota.c__Subgroup_21.o__Subgroup_21.f__Subgroup_21.g__Subgroup_21.s__uncultured_bacterium |
| 80 | OTU80 | d__Bacteria...... |
| 81 | OTU81 | d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_bacterium |
| 82 | OTU82 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Caldithrix.s__uncultured_delta |
| 83 | OTU83 | d__Bacteria.p__SAR324_clade.Marine_group_B..c__SAR324_clade.Marine_group_B..o__SAR324_clade.Marine_group_B..f__SAR324_clade.Marine_group_B..gSAR324_clade.Marine_group_B.. |
| 84 | OTU84 | d__Bacteria.p__Latescibacterota.c__Latescibacteria.o__Latescibacterales.f__Latescibacteraceae.g__Latescibacteraceae.s__uncultured_bacterium |
| 85 | OTU85 | d__Bacteria.pChloroflexi..... |
| 88 | OTU88 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.gSpirochaeta_2. |
| 94 | OTU94 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta.s__uncultured_bacterium |
| 95 | OTU95 | d__Bacteria.p__Cloacimonadota.c__Cloacimonadia.o__Cloacimonadales.f__MSBL8.g__MSBL8.s__uncultured_bacterium |
| 98 | OTU98 | d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.gAcetothermiia. |
| 102 | OTU102 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__Melioribacteraceae.g__IheB3.7.s__uncultured_Chlorobi |
| 104 | OTU104 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Hyphomicrobiaceae.. |
| 105 | OTU105 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.gHalochromatium. |
| 108 | OTU108 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.gGWE2.31.10. |
| 111 | OTU111 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Ardenticatenales.f__uncultured.guncultured. |
| 113 | OTU113 | d__Archaea.p__Altiarchaeota.c__Altiarchaeia.o__Altiarchaeales.f__Altiarchaeaceae.g__Candidatus_Altiarchaeum.s__uncultured_archaeon |
| 116 | OTU116 | d__Bacteria.p__Zixibacteria.c__Zixibacteria.o__Zixibacteria.f__Zixibacteria.g__Zixibacteria.s__uncultured_bacterium |
| 118 | OTU118 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__Marine_Benthic_Group_D_and_DHVEG.1.f__Marine_Benthic_Group_D_and_DHVEG.1.g__Marine_Benthic_Group_D_and_DHVEG.1.s__uncultured_archaeon |
| 119 | OTU119 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Marinilabiliaceae.g__uncultured.s__uncultured_Bacteroidetes |
| 120 | OTU120 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfobulbaceae.gDesulfobulbus. |
| 122 | OTU122 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Marinilabiliaceae.guncultured. |
| 123 | OTU123 | d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.gPir4_lineage. |
| 124 | OTU124 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__GWE2.31.10.s__uncultured_gamma |
| 130 | OTU130 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__SAR11_clade.f__Clade_III.gClade_III. |
| 132 | OTU132 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__BD7.8.f__BD7.8.gBD7.8. |
| 133 | OTU133 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.gDesulfatiglans. |
| 135 | OTU135 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__SAR11_clade.f__Clade_I.gClade_Ia. |
| 136 | OTU136 | d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.g__Candidatus_Kaiserbacteria.s__uncultured_bacterium |
| 139 | OTU139 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Micavibrionales.f__Micavibrionaceae.g__uncultured.s__uncultured_bacterium |
| 140 | OTU140 | d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Cyanobacteriales.f__Cyanobacteriaceae.. |
| 144 | OTU144 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gAquibacter. |
| 146 | OTU146 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__GWE2.31.10.s__uncultured_bacterium |
| 147 | OTU147 | d__Bacteria.p__Nitrospirota.c__Thermodesulfovibrionia.o__uncultured.f__uncultured.g__uncultured.s__uncultured_Nitrospirae |
| 148 | OTU148 | d__Bacteria.p__Verrucomicrobiota.c__Kiritimatiellae.o__Kiritimatiellales.f__Kiritimatiellaceae.gR76.B128. |
| 149 | OTU149 | d__Bacteria.p__Acidobacteriota.c__Subgroup_18.o__Subgroup_18.f__Subgroup_18.gSubgroup_18. |
| 150 | OTU150 | d__Archaea...... |
| 152 | OTU152 | d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Pedosphaerales.f__Pedosphaeraceae.g__DEV008.s__uncultured_bacterium |
| 153 | OTU153 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__uncultured.f__uncultured.g__uncultured.s__uncultured_archaeon |
| 155 | OTU155 | d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__Melioribacteraceae.g__IheB3.7.s__uncultured_sediment |
| 156 | OTU156 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta_2.s__uncultured_bacterium |
| 157 | OTU157 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__SEEP.SRB1.s__Olavius_ilvae |
| 158 | OTU158 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Sediminispirochaeta.s__uncultured_bacterium |
| 159 | OTU159 | d__Bacteria.pAcidobacteriota..... |
| 160 | OTU160 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__uncultured.s__uncultured_Alphaproteobacteria |
| 161 | OTU161 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Caldilineales.f__Caldilineaceae.guncultured. |
| 162 | OTU162 | d__Bacteria.p__Gemmatimonadota.c__PAUC43f_marine_benthic_group.o__PAUC43f_marine_benthic_group.f__PAUC43f_marine_benthic_group.g__PAUC43f_marine_benthic_group.s__uncultured_actinobacterium |
| 168 | OTU168 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiomicrospirales.f__Thiomicrospiraceae.gendosymbionts. |
| 169 | OTU169 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Gammaproteobacteria_Incertae_Sedis.f__Unknown_Family.guncultured. |
| 173 | OTU173 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Caldithrix.s__uncultured_Deferribacteres |
| 175 | OTU175 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Caulobacterales.f__Hyphomonadaceae.. |
| 181 | OTU181 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.... |
| 185 | OTU185 | d__Bacteria.p__Patescibacteria.c__WWE3.o__WWE3.f__WWE3.g__WWE3.s__uncultured_bacterium |
| 189 | OTU189 | d__Bacteria.p__Firmicutes.c__Clostridia.o__Lachnospirales.f__Defluviitaleaceae.g__Defluviitaleaceae_UCG.011.s__uncultured_bacterium |
| 190 | OTU190 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.g__uncultured.s__bacterium_enrichment |
| 191 | OTU191 | d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.o__Woesearchaeales.f__Woesearchaeales.g__Woesearchaeales.s__uncultured_crenarchaeote |
| 192 | OTU192 | d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.gRhodopirellula. |
| 194 | OTU194 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta_2.s__Spirochaeta_isovalerica |
| 204 | OTU204 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__SBR1031.f__SBR1031.g__SBR1031.s__metagenome |
| 205 | OTU205 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.gSediminispirochaeta. |
| 208 | OTU208 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gLutibacter. |
| 211 | OTU211 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfosarcina.s__uncultured_marine |
| 213 | OTU213 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gDesulfatitalea. |
| 216 | OTU216 | d__Bacteria.p__Actinobacteriota.c__WCHB1.81.o__WCHB1.81.f__WCHB1.81.g__WCHB1.81.s__uncultured_bacterium |
| 217 | OTU217 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfobulbaceae.g__Desulfobulbus.s__uncultured_bacterium |
| 222 | OTU222 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Prolixibacteraceae.gSunxiuqinia. |
| 226 | OTU226 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__KZNMV.5.B42.f__KZNMV.5.B42.gKZNMV.5.B42. |
| 228 | OTU228 | d__Bacteria.p__Verrucomicrobiota.c__Lentisphaeria.o__Oligosphaerales.f__Lenti.02.g__Lenti.02.s__uncultured_bacterium |
| 230 | OTU230 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.g__uncultured.s__uncultured_Bacteroidetes |
| 231 | OTU231 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.. |
| 232 | OTU232 | d__Bacteria.p__Chloroflexi.c__KD4.96.o__KD4.96.f__KD4.96.gKD4.96. |
| 236 | OTU236 | d__Bacteria.p__Bacteroidota.c__Kryptonia.o__Kryptoniales.f__MSB.3C8.gMSB.3C8. |
| 237 | OTU237 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Gammaproteobacteria_Incertae_Sedis.f__Unknown_Family.g__uncultured.s__uncultured_marine |
| 239 | OTU239 | d__Bacteria.p__Zixibacteria.c__Zixibacteria.o__Zixibacteria.f__Zixibacteria.gZixibacteria. |
| 242 | OTU242 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Gammaproteobacteria_Incertae_Sedis.f__Unknown_Family.g__Unknown_Family.s__uncultured_bacterium |
| 249 | OTU249 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Sedimenticolaceae.guncultured. |
| 252 | OTU252 | d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Verrucomicrobiales.f__DEV007.gDEV007. |
| 253 | OTU253 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.g__Desulfatiglans.s__uncultured_delta |
| 260 | OTU260 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__CCM11a.f__CCM11a.g__CCM11a.s__uncultured_organism |
| 266 | OTU266 | d__Bacteria.p__Patescibacteria.c__ABY1.o__Candidatus_Kerfeldbacteria.f__Candidatus_Kerfeldbacteria.g__Candidatus_Kerfeldbacteria.s__uncultured_bacterium |
| 267 | OTU267 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Caulobacterales.f__Hyphomonadaceae.guncultured. |
| 274 | OTU274 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.g__Thermomarinilinea.s__uncultured_bacterium |
| 276 | OTU276 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.... |
| 277 | OTU277 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Kiloniellales.f__Kiloniellaceae.gTistlia. |
| 284 | OTU284 | d__Bacteria.p__LCP.89.c__LCP.89.o__LCP.89.f__LCP.89.g__LCP.89.s__saltmarsh_clone |
| 292 | OTU292 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmatota.o__Thermoplasmatota.f__Thermoplasmatota.g__uncultured.s__uncultured_archaeon |
| 293 | OTU293 | d__Archaea.p__Aenigmarchaeota.c__Deep_Sea_Euryarchaeotic_Group.DSEG..o__Deep_Sea_Euryarchaeotic_Group.DSEG..f__Deep_Sea_Euryarchaeotic_Group.DSEG..g__Deep_Sea_Euryarchaeotic_Group.DSEG..s__uncultured_archaeon |
| 300 | OTU300 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Sediminispirochaeta.s__uncultured_Spirochaetes |
| 304 | OTU304 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gMaritimimonas. |
| 305 | OTU305 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__uncultured.s__uncultured_organism |
| 307 | OTU307 | d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.oWoesearchaeales... |
| 314 | OTU314 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__UBA10353_marine_group.f__UBA10353_marine_group.g__UBA10353_marine_group.s__uncultured_bacterium |
| 318 | OTU318 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Hyphomicrobiaceae.g__Filomicrobium.s__uncultured_Alphaproteobacteria |
| 328 | OTU328 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfatirhabdium.s__uncultured_delta |
| 334 | OTU334 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__MSBL9.f__SG8.4.gSG8.4. |
| 337 | OTU337 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__SJA.15.f__SJA.15.g__SJA.15.s__uncultured_Chloroflexi |
| 340 | OTU340 | d__Bacteria.p__Patescibacteria.c__Gracilibacteria.o__Absconditabacteriales_.SR1..f_Absconditabacteriales.SR1..g_Absconditabacteriales.SR1..s__uncultured_bacterium |
| 341 | OTU341 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Crocinitomicaceae.g__Crocinitomix.s__uncultured_bacterium |
| 353 | OTU353 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Oceanospirillales.f__Nitrincolaceae.guncultured. |
| 354 | OTU354 | d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta_2.s__uncultured_Spirochaetes |
| 356 | OTU356 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__CCM11a.f__CCM11a.g__CCM11a.s__uncultured_bacterium |
| 358 | OTU358 | d__Bacteria.p__Elusimicrobiota.c__Lineage_IIb.o__Lineage_IIb.f__Lineage_IIb.g__Lineage_IIb.s__uncultured_bacterium |
| 368 | OTU368 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.gHIMB11. |
| 371 | OTU371 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Maritimimonas.s__uncultured_organism |
| 372 | OTU372 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Oceanospirillales.f__Pseudohongiellaceae.gPseudohongiella. |
| 381 | OTU381 | d__Bacteria.p__Patescibacteria.c__Gracilibacteria.o__Candidatus_Peregrinibacteria.f__Candidatus_Peregrinibacteria.g__Candidatus_Peregrinibacteria.s__uncultured_bacterium |
| 386 | OTU386 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Vibrionales.f__Vibrionaceae.gVibrio. |
| 389 | OTU389 | d__Bacteria.p__Bdellovibrionota.c__Bdellovibrionia.o__Bacteriovoracales.f__Bacteriovoracaceae.g__Peredibacter.s__uncultured_bacterium |
| 390 | OTU390 | d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_organism |
| 392 | OTU392 | d__Archaea.p__Asgardarchaeota.c__Odinarchaeia.o__Odinarchaeia.f__Odinarchaeia.g__Odinarchaeia.s__uncultured_marine |
| 393 | OTU393 | d__Bacteria.p__Latescibacterota.c__Latescibacteria.o__Latescibacterales.f__Latescibacteraceae.g__Latescibacteraceae.s__uncultured_actinobacterium |
| 394 | OTU394 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Sva0081_sediment_group.s__uncultured_marine |
| 397 | OTU397 | d__Bacteria.p__NB1.j.c__NB1.j.o__NB1.j.f__NB1.j.gNB1.j. |
| 400 | OTU400 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__CCM11a.f__CCM11a.gCCM11a. |
| 401 | OTU401 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Marinilabiliaceae.g__Labilibacter.s__uncultured_Bacteroidetes |
| 402 | OTU402 | d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.g__Acetothermiia.s__uncultured_organism |
| 421 | OTU421 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Micavibrionales.f__uncultured.guncultured. |
| 424 | OTU424 | d__Bacteria.p__Gemmatimonadota.c__PAUC43f_marine_benthic_group.o__PAUC43f_marine_benthic_group.f__PAUC43f_marine_benthic_group.gPAUC43f_marine_benthic_group. |
| 426 | OTU426 | d__Bacteria.p__Verrucomicrobiota.c__Omnitrophia.o__Omnitrophales.f__Omnitrophaceae.g__Candidatus_Omnitrophus.s__uncultured_bacterium |
| 435 | OTU435 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.gDesulfobacter. |
| 436 | OTU436 | d__Bacteria.p__Desulfobacterota.c__Desulfuromonadia.o__Bradymonadales.f__Bradymonadales.gBradymonadales. |
| 448 | OTU448 | d__Bacteria.p__Patescibacteria.c__ABY1.o__Candidatus_Falkowbacteria.f__Candidatus_Falkowbacteria.gCandidatus_Falkowbacteria. |
| 453 | OTU453 | d__Bacteria.p__Modulibacteria.c__Moduliflexia.o__Moduliflexales.f__Moduliflexaceae.gModuliflexaceae. |
| 463 | OTU463 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__RBG.13.54.9.f__RBG.13.54.9.g__RBG.13.54.9.s__uncultured_bacterium |
| 475 | OTU475 | d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__GIF9.f__AB.539.J10.g__SCGC.AB.539.J10.s__uncultured_bacterium |
| 476 | OTU476 | d__Bacteria.p__Myxococcota.c__Polyangia.o__MidBa8.f__MidBa8.g__MidBa8.s__uncultured_delta |
| 480 | OTU480 | d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Calditrichaceae.s__uncultured_organism |
| 482 | OTU482 | d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Chloroplast.f__Chloroplast.g__Chloroplast.s__Chromerida_sp. |
| 483 | OTU483 | d__Bacteria.p__Bdellovibrionota.c__Oligoflexia.o__Oligoflexales.f__uncultured.g__uncultured.s__uncultured_organism |
| 490 | OTU490 | d__Bacteria.p__Bdellovibrionota.c__Bdellovibrionia.o__Bacteriovoracales.f__Bacteriovoracaceae.gPeredibacter. |
| 495 | OTU495 | d__Archaea.p__Micrarchaeota.c__Micrarchaeia.o__Micrarchaeales.f__Micrarchaeales.g__Micrarchaeales.s__uncultured_euryarchaeote |
| 501 | OTU501 | d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_Bacteroidetes |
| 503 | OTU503 | d__Bacteria.p__Fibrobacterota.c__Chitinivibrionia.o__Chitinivibrionales.f__Chitinivibrionaceae.g__possible_genus_03.s__uncultured_bacterium |
| 508 | OTU508 | d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Verrucomicrobiales.f__DEV007.g__DEV007.s__uncultured_organism |
| 509 | OTU509 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodospirillales.f__Rhodospirillaceae.g__Candidatus_Riegeria.s__uncultured_bacterium |
| 530 | OTU530 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiomicrospirales.f__Thioglobaceae.gSUP05_cluster. |
| 532 | OTU532 | d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.g__Acetothermiia.s__bacterium_enrichment |
| 537 | OTU537 | d__Bacteria.p__Fermentibacterota.c__Fermentibacteria.o__Fermentibacterales.f__Fermentibacteraceae.g__Fermentibacteraceae.s__uncultured_sediment |
| 539 | OTU539 | d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.gCandidatus_Kaiserbacteria. |
| 541 | OTU541 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Ga0077536.f__Ga0077536.g__Ga0077536.s__uncultured_gamma |
| 547 | OTU547 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Rhizobiales_Incertae_Sedis.gAnderseniella. |
| 553 | OTU553 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Helicobacteraceae.. |
| 554 | OTU554 | d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__GIF9.f__AB.539.J10.g__AB.539.J10.s__uncultured_bacterium |
| 565 | OTU565 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__Portibacter.s__uncultured_Bacteroidetes |
| 568 | OTU568 | d__Bacteria.p__Margulisbacteria.c__Margulisbacteria.o__Margulisbacteria.f__Margulisbacteria.g__Margulisbacteria.s__uncultured_bacterium |
| 570 | OTU570 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Thermoflexales.f__Thermoflexaceae.gThermoflexus. |
| 574 | OTU574 | d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.g__uncultured.s__uncultured_organism |
| 585 | OTU585 | d__Bacteria.p__Bacteroidota.c__Kryptonia.o__Kryptoniales.f__MSB.3C8.g__MSB.3C8.s__uncultured_bacterium |
| 618 | OTU618 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Ectothiorhodospirales.f__Ectothiorhodospiraceae.g__Thiogranum.s__uncultured_gamma |
| 620 | OTU620 | d__Bacteria.p__Firmicutes.c__Clostridia.o__Lachnospirales.f__Lachnospiraceae.g__uncultured.s__Clostridiales_bacterium |
| 622 | OTU622 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__uncultured.g__uncultured.s__uncultured_Bacteroidetes |
| 628 | OTU628 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Rs.M59_termite_group.g__Rs.M59_termite_group.s__uncultured_bacterium |
| 631 | OTU631 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Campylobacteraceae.g__Campylobacter.s__uncultured_Epsilonproteobacteria |
| 657 | OTU657 | d__Bacteria.p__Actinobacteriota.c__Coriobacteriia.o__OPB41.f__OPB41.g__OPB41.s__unidentified_marine |
| 663 | OTU663 | d__Bacteria.p__Acidobacteriota.c__Vicinamibacteria.o__Subgroup_9.f__Subgroup_9.g__Subgroup_9.s__uncultured_bacterium |
| 694 | OTU694 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__NS5_marine_group.s__uncultured_Flavobacterium |
| 700 | OTU700 | d__Bacteria.p__SAR324_clade.Marine_group_B..c__SAR324_clade.Marine_group_B..o__SAR324_clade.Marine_group_B..f__SAR324_clade.Marine_group_B..g__SAR324_clade.Marine_group_B..s__uncultured_organism |
| 706 | OTU706 | d__Bacteria.p__Myxococcota.c__Polyangia.o__UASB.TL25.f__UASB.TL25.gUASB.TL25. |
| 708 | OTU708 | d__Bacteria.p__Patescibacteria.c__Gracilibacteria.o__Gracilibacteria.f__Gracilibacteria.g__Gracilibacteria.s__uncultured_organism |
| 714 | OTU714 | d__Bacteria.p__Verrucomicrobiota.c__Omnitrophia.o__Omnitrophales.f__Omnitrophales.g__Omnitrophales.s__uncultured_bacterium |
| 729 | OTU729 | d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.g__Candidatus_Kaiserbacteria.s__uncultured_organism |
| 742 | OTU742 | d__Bacteria.p__Planctomycetota.c__Pla4_lineage.o__Pla4_lineage.f__Pla4_lineage.g__Pla4_lineage.s__uncultured_bacterium |
| 750 | OTU750 | d__Bacteria.p__Desulfobacterota.c__Syntrophobacteria.o__Syntrophobacterales.f__uncultured.g__uncultured.s__uncultured_delta |
| 761 | OTU761 | d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Microtrichales.f__Microtrichaceae.gSva0996_marine_group. |
| 776 | OTU776 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodospirillales.f__Terasakiellaceae.guncultured. |
| 789 | OTU789 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.g__Desulfopila.s__uncultured_bacterium |
| 794 | OTU794 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.gMarivita. |
| 803 | OTU803 | d__Archaea.p__Aenigmarchaeota.c__Aenigmarchaeia.o__Aenigmarchaeales.f__Aenigmarchaeales.gAenigmarchaeales. |
| 814 | OTU814 | d__Archaea.p__Aenigmarchaeota.c__Aenigmarchaeia.o__Aenigmarchaeales.f__Aenigmarchaeales.g__Aenigmarchaeales.s__uncultured_archaeon |
| 823 | OTU823 | d__Bacteria.p__WS2.c__WS2.o__WS2.f__WS2.g__WS2.s__uncultured_Firmicutes |
| 837 | OTU837 | d__Bacteria.p__Latescibacterota.c__Latescibacteria.o__Latescibacterales.f__Latescibacteraceae.g__Latescibacteraceae.s__bacterium_enrichment |
| 840 | OTU840 | d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__SAR202_clade.f__SAR202_clade.gSAR202_clade. |
| 845 | OTU845 | d__Bacteria.p__Dependentiae.c__Babeliae.o__Babeliales.f__Vermiphilaceae.gVermiphilaceae. |
| 897 | OTU897 | d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__GIF9.f__AB.539.J10.gAB.539.J10. |
| 905 | OTU905 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Granulosicoccales.f__Granulosicoccaceae.g__Granulosicoccus.s__uncultured_bacterium |
| 908 | OTU908 | d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.oOpitutales... |
| 933 | OTU933 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Sulfurimonadaceae.g__Sulfurimonas.s__bacterium_enrichment |
| 937 | OTU937 | d__Bacteria.p__Acidobacteriota.c__Acidobacteriae.... |
| 950 | OTU950 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__CH2b56.f__CH2b56.g__CH2b56.s__uncultured_bacterium |
| 954 | OTU954 | d__Bacteria.p__Planctomycetota.c__vadinHA49.o__vadinHA49.f__vadinHA49.g__vadinHA49.s__uncultured_organism |
| 967 | OTU967 | d__Bacteria.p__Planctomycetota.c__Pla3_lineage.o__Pla3_lineage.f__Pla3_lineage.g__Pla3_lineage.s__uncultured_planctomycete |
| 983 | OTU983 | d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfobulbaceae.g__uncultured.s__uncultured_gamma |
| 987 | OTU987 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.g__uncultured.s__uncultured_organism |
| 988 | OTU988 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Sphingomonadales.f__Sphingomonadaceae.g__Erythrobacter.s__Erythrobacter_longus |
| 989 | OTU989 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__Lewinella.s__uncultured_organism |
| 991 | OTU991 | d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.g__Acetothermiia.s__uncultured_bacterium |
| 993 | OTU993 | d__Bacteria.p__Sumerlaeota.c__Sumerlaeia.o__Sumerlaeales.f__Sumerlaeaceae.g__Sumerlaea.s__uncultured_organism |
| 998 | OTU998 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Flavobacteriaceae.s__Aureicoccus_marinus |
| 1000 | OTU1000 | d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Synechococcales.f__Cyanobiaceae.gCyanobium_PCC.6307. |
| 1001 | OTU1001 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Paracaedibacterales.f__Paracaedibacteraceae.g__Candidatus_Captivus.s__uncultured_bacterium |
| 1003 | OTU1003 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__NS11.12_marine_group.g__NS11.12_marine_group.s__uncultured_marine |
| 1013 | OTU1013 | d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.o__Woesearchaeales.f__GW2011_GWC1_47_15.gGW2011_GWC1_47_15. |
| 1014 | OTU1014 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.oCellvibrionales... |
| 1041 | OTU1041 | d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Verrucomicrobiales.f__Rubritaleaceae.g__uncultured.s__uncultured_bacterium |
| 1050 | OTU1050 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__uncultured.g__uncultured.s__uncultured_bacterium |
| 1057 | OTU1057 | d__Bacteria.p__Dependentiae.c__Babeliae.o__Babeliales.f__Babeliales.g__Babeliales.s__uncultured_organism |
| 1064 | OTU1064 | d__Bacteria.p__Dependentiae.c__Babeliae.o__Babeliales.f__Vermiphilaceae.g__Vermiphilaceae.s__uncultured_delta |
| 1070 | OTU1070 | d__Bacteria.p__Cyanobacteria.c__Vampirivibrionia.o__Gastranaerophilales.f__Gastranaerophilales.gGastranaerophilales. |
| 1086 | OTU1086 | d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.gRubripirellula. |
| 1097 | OTU1097 | d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__unidentified_bacterium |
| 1100 | OTU1100 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.g__uncultured.s__uncultured_soil |
| 1108 | OTU1108 | d__Bacteria.p__NB1.j.c__NB1.j.o__NB1.j.f__NB1.j.g__NB1.j.s__uncultured_prokaryote |
| 1114 | OTU1114 | d__Bacteria.p__Verrucomicrobiota.c__Lentisphaeria.o__SS1.B.02.17.f__SS1.B.02.17.g__SS1.B.02.17.s__uncultured_Lentisphaerae |
| 1121 | OTU1121 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Eel.36e1D6.g__Eel.36e1D6.s__uncultured_Polyangiaceae |
| 1127 | OTU1127 | d__Bacteria.p__Actinobacteriota.c__WCHB1.81.o__WCHB1.81.f__WCHB1.81.g__WCHB1.81.s__uncultured_actinobacterium |
| 1129 | OTU1129 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__Marine_Benthic_Group_D_and_DHVEG.1.f__Marine_Benthic_Group_D_and_DHVEG.1.g__Marine_Benthic_Group_D_and_DHVEG.1.s__archaeon_enrichment |
| 1142 | OTU1142 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Sulfurimonadaceae.. |
| 1150 | OTU1150 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.g__Candidatus_Thiobios.s__uncultured_gamma |
| 1160 | OTU1160 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Caulobacterales.f__Parvularculaceae.gParvularcula. |
| 1162 | OTU1162 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.gRoseobacter. |
| 1173 | OTU1173 | d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Actinomarinales.f__uncultured.g__uncultured.s__bacterium_WH6.7 |
| 1178 | OTU1178 | d__Bacteria.p__Firmicutes.c__Bacilli.o__Izemoplasmatales.f__Izemoplasmataceae.g__Izimaplasma.s__Candidatus_Izimaplasma |
| 1189 | OTU1189 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.g__Desulfospira.s__uncultured_bacterium |
| 1194 | OTU1194 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Oceanospirillales.f__Saccharospirillaceae.gReinekea. |
| 1196 | OTU1196 | d__Bacteria.p__CK.2C2.2.c__CK.2C2.2.o__CK.2C2.2.f__CK.2C2.2.g__CK.2C2.2.s__uncultured_organism |
| 1198 | OTU1198 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Cryomorphaceae.g__Vicingus.s__uncultured_sediment |
| 1205 | OTU1205 | d__Bacteria.p__Chloroflexi.c__Chloroflexia.oThermomicrobiales... |
| 1226 | OTU1226 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Hyphomicrobiaceae.gFilomicrobium. |
| 1228 | OTU1228 | d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.g__Rhodopirellula.s__uncultured_hydrocarbon |
| 1235 | OTU1235 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Micavibrionales.f__uncultured.g__uncultured.s__uncultured_organism |
| 1238 | OTU1238 | d__Bacteria.p__Acidobacteriota.c__Thermoanaerobaculia.o__Thermoanaerobaculales.f__Thermoanaerobaculaceae.g__Subgroup_23.s__uncultured_prokaryote |
| 1240 | OTU1240 | d__Bacteria.p__Myxococcota.c__Polyangia.o__VHS.B4.70.f__VHS.B4.70.g__VHS.B4.70.s__uncultured_delta |
| 1248 | OTU1248 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__mle1.8.f__mle1.8.g__mle1.8.s__uncultured_organism |
| 1249 | OTU1249 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.gSandaracinus. |
| 1250 | OTU1250 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Nitrosococcales.f__Nitrosococcaceae.gCI75cm.2.12. |
| 1284 | OTU1284 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Steroidobacterales.f__Woeseiaceae.g__JTB255_marine_benthic_group.s__uncultured_organism |
| 1285 | OTU1285 | d__Bacteria.p__Fibrobacterota.c__Fibrobacteria.o__Fibrobacterales.f__TG3.gTG3. |
| 1287 | OTU1287 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__OM182_clade.f__OM182_clade.g__OM182_clade.s__unidentified_marine |
| 1289 | OTU1289 | d__Bacteria.p__Verrucomicrobiota.c__Lentisphaeria.o__SS1.B.02.17.f__SS1.B.02.17.gSS1.B.02.17. |
| 1298 | OTU1298 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__uncultured.s__uncultured_Bacteroidetes |
| 1299 | OTU1299 | d__Bacteria.p__Verrucomicrobiota.c__Kiritimatiellae.o__Kiritimatiellales.f__Kiritimatiellaceae.g__R76.B128.s__Kiritimatiella_sp. |
| 1301 | OTU1301 | d__Bacteria.p__Fusobacteriota.c__Fusobacteriia.o__Fusobacteriales.f__Fusobacteriaceae.gFusobacterium. |
| 1302 | OTU1302 | d__Archaea.p__Asgardarchaeota.c__Lokiarchaeia.o__Lokiarchaeia.f__Lokiarchaeia.g__Lokiarchaeia.s__uncultured_crenarchaeote |
| 1303 | OTU1303 | d__Bacteria.p__Desulfobacterota.c__Desulfuromonadia.o__uncultured.f__uncultured.g__uncultured.s__uncultured_bacterium |
| 1308 | OTU1308 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__mle1.8.f__mle1.8.gmle1.8. |
| 1310 | OTU1310 | d__Bacteria.p__Spirochaetota.c__MVP.15.o__MVP.15.f__MVP.15.g__MVP.15.s__uncultured_bacterium |
| 1312 | OTU1312 | d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Sulfurospirillaceae.gSulfurospirillum. |
| 1314 | OTU1314 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Nitrosococcales.f__Nitrosococcaceae.g__SZB85.s__uncultured_bacterium |
| 1318 | OTU1318 | d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__Methanomassiliicoccales.f__uncultured.g__uncultured.s__uncultured_euryarchaeote |
| 1321 | OTU1321 | d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Cyanobacteriales.f__Xenococcaceae.g__Chroococcidiopsis_PCC.6712.s__uncultured_cyanobacterium |
| 1325 | OTU1325 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__uncultured.f__uncultured.guncultured. |
| 1329 | OTU1329 | d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfococcaceae.gDesulfonema. |
| 1330 | OTU1330 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.. |
| 1338 | OTU1338 | d__Bacteria.p__Myxococcota.c__Polyangia.o__Blfdi19.f__Blfdi19.g__Blfdi19.s__uncultured_bacterium |
| 1343 | OTU1343 | d__Bacteria.p__Acidobacteriota.c__Thermoanaerobaculia.o__Thermoanaerobaculales.f__Thermoanaerobaculaceae.g__B276.D12.s__uncultured_bacterium |
| 1347 | OTU1347 | d__Bacteria.p__Fibrobacterota.c__Chitinivibrionia.o__uncultured.f__uncultured.guncultured. |
| 1350 | OTU1350 | d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gPolaribacter. |
| 1351 | OTU1351 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__Phycisphaerales.f__AKAU3564_sediment_group.g__AKAU3564_sediment_group.s__uncultured_bacterium |
| 1352 | OTU1352 | d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__D90.f__D90.g__D90.s__uncultured_gamma |
| 1353 | OTU1353 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__Amsterdam.1B.07.f__Amsterdam.1B.07.g__Amsterdam.1B.07.s__uncultured_bacterium |
| 1359 | OTU1359 | d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.g__Candidatus_Kaiserbacteria.s__bacterium_SH4.10 |
| 1361 | OTU1361 | d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.gPelolinea. |
| 1362 | OTU1362 | d__Bacteria.p__Gemmatimonadota.c__BD2.11_terrestrial_group.o__BD2.11_terrestrial_group.f__BD2.11_terrestrial_group.g__BD2.11_terrestrial_group.s__unidentified_bacterium |
| 1363 | OTU1363 | d__Bacteria.p__Sumerlaeota.c__Sumerlaeia.o__uncultured.f__uncultured.g__uncultured.s__uncultured_organism |
| 1369 | OTU1369 | d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__Phycisphaerales.f__Phycisphaeraceae.g__uncultured.s__uncultured_bacterium |
| 1370 | OTU1370 | d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_Acidobacterium |
| 1374 | OTU1374 | d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Defluviicoccales.f__Defluviicoccaceae.g__Defluviicoccus.s__uncultured_Rhodospirillaceae |
The separation between groups is 3.12.
We update the plot with the separations:
intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR.max",
"LR.max.local",
"Aldex",
"glmnet",
"glmnet.ext",
"intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Resultados %>%
kbl() %>%
kable_styling()
| Size | Separation | |
|---|---|---|
| LR.max | 549 | 2.829712 |
| LR.max.local | 46 | 4.655234 |
| Aldex | 121 | 3.429094 |
| glmnet | 350 | 2.776603 |
| glmnet.ext | 104 | 4.421816 |
| intersection | 312 | 3.116542 |
tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[1:12,]
Samples=tax_otu$ID
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")
DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs
taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
Miseria=as.vector(which(colSums(DF.0)<=2))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF=DF[7:12,]
DF.0=DF.0[7:12,]
DF.0.original=DF.0.original[7:12 ,]
Type=as.factor(c(rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors=c("purple","red")[Type]
After removing OTUs with 2 or fewer reads across the CAU samples and those appearing in only one of these samples, 1682 taxa remain.
The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions | |
|---|---|---|---|---|
| 2 | CAU_T0_2 | OTU966 | 55 | 0.0012193 |
| 1 | CAU_T2HW_1 | OTU727 | 55 | 0.0008798 |
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)
HC=ClusterHC(DF,Grups=Type,colores=colors)
base=Sep(HC)
The two sample types are NOT well separated.
We examine the proportion of OTU subsets of reasonable size that already classify them correctly.
# Fijo una semilla de aleatoriedad para el experimento
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1715851715853234
initial_seed %% 10000
# [1] 3234
set.seed(3234)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=500
F.SS=c()
for (i in 10:1000)
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
if(length(which(EE[1,]==0))>0){
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))}
else {
F.SS=rbind(F.SS, c(i,
0,
NA,
NA,
NA))
}
}
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CAU.T2CvsT2HW.RData")
F.SS=readRDS("Fraccio.Separadors.CAU.T2CvsT2HW.RData")
The first plot shows, for each \(n\) up to 600, the proportion of samples of size \(n\) that perfectly classify CAU_T2C and CAU_T2HW (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",ylim=c(0,0.1),xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,0.1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU_T2C and CAU_T2HW (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve). Note that from \(n=350\) onward, NO subset is found that separates the two groups.
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,1.5,15))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
`
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CAU_2C2HW.RData")
LRS=readRDS("LRS_CAU_2C2HW.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_CAU_2C2HW.RData")
assoc=readRDS("assoc_CAU_2C2HW.RData")
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 95 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 1.98.
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Log_ratio.max=Indicador(impLR))
coda_glmnet)coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL
coda_glmnet does not find taxa with significant differences, likely due to the low power with only 3 vs. 3 samples. When this happens, we add random replicates, with the idea that stochastic resonance may strengthen the signal. We test with 3, 5, and 10 replicates.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711435693
initial_seed %% 10000
# [1] 5693
# 3 replicates
set.seed(5693)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T2C_1,repls$CAU_T2C_2,repls$CAU_T2C_3,repls$CAU_T2HW_1,repls$CAU_T2HW_2,repls$CAU_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T2C_1",1:M,sep="_"),
paste("CAU_T2C_2",1:M,sep="_"),
paste("CAU_T2C_3",1:M,sep="_"),
paste("CAU_T2HW_1",1:M,sep="_"),
paste("CAU_T2HW_2",1:M,sep="_"),
paste("CAU_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T2C", 3*M),rep("CAU_T2HW", 3*M),rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_2C2HW_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_2C2HW_3copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))
With 3 replicates, 42 significant taxa are found, perfectly separating the two groups, with a separation of 1.49.
# 5 replicates
set.seed(5693)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T2C_1,repls$CAU_T2C_2,repls$CAU_T2C_3,repls$CAU_T2HW_1,repls$CAU_T2HW_2,repls$CAU_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T2C_1",1:M,sep="_"),
paste("CAU_T2C_2",1:M,sep="_"),
paste("CAU_T2C_3",1:M,sep="_"),
paste("CAU_T2HW_1",1:M,sep="_"),
paste("CAU_T2HW_2",1:M,sep="_"),
paste("CAU_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T2C", 3*M),rep("CAU_T2HW", 3*M),rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_2C2HW_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_2C2HW_5copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))
With 5 replicates, 53 significant taxa are found, perfectly separating the two groups, with a separation of 1.2.
Significant=cbind(Significant,
glmnet_sign.ext.5.T2C=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T2HW=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
set.seed(5693)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T2C_1,repls$CAU_T2C_2,repls$CAU_T2C_3,repls$CAU_T2HW_1,repls$CAU_T2HW_2,repls$CAU_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T2C_1",1:M,sep="_"),
paste("CAU_T2C_2",1:M,sep="_"),
paste("CAU_T2C_3",1:M,sep="_"),
paste("CAU_T2HW_1",1:M,sep="_"),
paste("CAU_T2HW_2",1:M,sep="_"),
paste("CAU_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T2C", 3*M),rep("CAU_T2HW", 3*M),rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_2C2HW_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_2C2HW_10copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))
With 10 replicates, 59 significant taxa are found, perfectly separating the two groups, with a separation of 1.12.
Significant=cbind(Significant,
glmnet_sign.ext.10.T2C=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T2HW=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711461735
initial_seed %% 10000
# [1] 1735
set.seed(1735)
repl_kw=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
saveRDS(repl_kw, file="repl_kw_CAU_2C2HW.RData")
saveRDS(aldex_kw, file="aldex_kw_CAU_2C2HW.RData")
p.valores_kw=readRDS("aldex_kw_CAU_2C2HW.RData")
min(p.valores_kw[,1])
## [1] 0.04999719
min(p.valores_kw[,2])
## [1] 0.3641574
min(p.valores_kw[,3])
## [1] 0.002010094
min(p.valores_kw[,4])
## [1] 0.07355742
length(p.valores_kw[p.valores_kw[,3]<0.01,2])
## [1] 11
length(p.valores_kw[p.valores_kw[,3]<0.05,2])
## [1] 68
There are no taxa with even an unadjusted p-value < 0.05.
DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP.07=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.7)]
indexos_SMP.05=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.5)]
OTUs_SMP.07=sort(attr(SMP.prop$CAU_T2C_CAU_T2HW$cusum[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.7)],"names"))
OTUs_SMP.good.07=setdiff(OTUs_SMP.07,Unics$OTUs)
simper identifies 170 significant taxa, but they do not separate the samples well.
HC=ClusterHC(DF[,OTUs_SMP.good.07],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
Significant=cbind(Significant,
t(DF.prop))
write.csv2(Significant,"OTU_Significativos_CAU_T2.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10
)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10")
colnames(Resultados)=c("Size","Separation")
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates
Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
There are 10 taxa. They are:
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU38 | OTU38 | Unassigned;;;;;; |
| OTU108 | OTU108 | d__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Spirochaetales;f__Spirochaetaceae;gGWE2-31-10; |
| OTU199 | OTU199 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Halomonadaceae;gCobetia; |
| OTU208 | OTU208 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gLutibacter; |
| OTU337 | OTU337 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__SJA-15;f__SJA-15;g__SJA-15;s__uncultured_Chloroflexi |
| OTU365 | OTU365 | d__Bacteria;p__Firmicutes;c__Clostridia;oLachnospirales;;; |
| OTU383 | OTU383 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Alteromonadaceae;; |
| OTU685 | OTU685 | d__Bacteria;p__PAUC34f;c__PAUC34f;o__PAUC34f;f__PAUC34f;g__PAUC34f;s__uncultured_bacterium |
| OTU708 | OTU708 | d__Bacteria;p__Patescibacteria;c__Gracilibacteria;o__Gracilibacteria;f__Gracilibacteria;g__Gracilibacteria;s__uncultured_organism |
| OTU1120 | OTU1120 | d__Bacteria;p__Calditrichota;c__Calditrichia;o__Calditrichales;f__Calditrichaceae;gCaldithrix; |
The separation between groups is 1.96.
We update the plot with the separations:
intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Resultados %>%
kbl() %>%
kable_styling()
| Size | Separation | |
|---|---|---|
| LR | 95 | 1.981986 |
| glmnet.ext.3 | 42 | 1.492428 |
| glmnet.ext.5 | 53 | 1.203924 |
| glmnet.ext.10 | 59 | 1.120839 |
| intersection | 10 | 1.960728 |
tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[13:24,]
Samples=tax_otu$ID
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")
DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs
taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
indices=1:n.OTUs.original
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
Miseria=as.vector(which(colSums(DF.0)<=2|colSums(DF.0.hits)==1))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
indices=indices[-Miseria]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF.0=DF.0[7:12,]
DF=DF[7:12,]
DF.0.original=DF.0.original[7:12 ,]
Type=as.factor(c(rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors=c("purple","red")[Type]
After removing OTUs with 2 or fewer reads across the CAU samples and those appearing in only one of these samples, 2247 taxa remain.
The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions |
|---|---|---|---|
| CYM_T2C_1 | OTU2121 | 186 | 0.0020402 |
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)
HC=ClusterHC(DF,Grups=Type,colores=colors)
base=Sep(HC)
The two sample types are perfectly separated from the start. The separation between groups is 1.21.
We examine the proportion of OTU subsets of reasonable size that already classify them correctly.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1714640507
initial_seed %% 10000
# [1] 507
set.seed(507)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:1000)
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))
}
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CYM.T2CvsT2HW.RData")
F.SS=readRDS("Fraccio.Separadors.CYM.T2CvsT2HW.RData")
The first plot shows, for each \(n\) up to 600, the proportion of samples of size \(n\) that perfectly classify CYM_T2C and CYM_T2HW (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",xlim=c(0,1000),xaxp=c(0,1000,20),yaxp=c(0,1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CYM_T2C and CYM_T2HW (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,1000),xaxp=c(0,1000,20),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=base,col="green")
`
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CYM_2C2HW.RData")
LRS=readRDS("LRS_CYM_2C2HW.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_CYM_2C2HW.RData")
assoc=readRDS("assoc_CYM_2C2HW.RData")
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 563 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.15.
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Log_ratio.max=Indicador(impLR)
)
coda_glmnet)coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL
coda_glmnet does not find taxa with significant differences. We add random replicates. We test with 3, 5, and 10 replicates.
# 3 replicates
set.seed(5693)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T2C_1,repls$CYM_T2C_2,repls$CYM_T2C_3,repls$CYM_T2HW_1,repls$CYM_T2HW_2,repls$CYM_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T2C_1",1:M,sep="_"),
paste("CYM_T2C_2",1:M,sep="_"),
paste("CYM_T2C_3",1:M,sep="_"),
paste("CYM_T2HW_1",1:M,sep="_"),
paste("CYM_T2HW_2",1:M,sep="_"),
paste("CYM_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T2C", 3*M),rep("CYM_T2HW", 3*M),rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_2C2HW_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_2C2HW_3copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))
With 3 replicates, 81 significant taxa are found, perfectly separating the two groups, with a separation of 3.67.
Significant=cbind(Significant,
glmnet_sign.ext.3.T2C=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`>0]),
glmnet_sign.ext.3.T2HW=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`<0]))
# 5 replicates
set.seed(5693)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T2C_1,repls$CYM_T2C_2,repls$CYM_T2C_3,repls$CYM_T2HW_1,repls$CYM_T2HW_2,repls$CYM_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T2C_1",1:M,sep="_"),
paste("CYM_T2C_2",1:M,sep="_"),
paste("CYM_T2C_3",1:M,sep="_"),
paste("CYM_T2HW_1",1:M,sep="_"),
paste("CYM_T2HW_2",1:M,sep="_"),
paste("CYM_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T2C", 3*M),rep("CYM_T2HW", 3*M),rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_2C2HW_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_2C2HW_5copies.RData")
LogRat_mostres[[1]]=LogRat_mostres[[1]][c(1:30,37:42),]
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))
With 5 replicates, 84 significant taxa are found, perfectly separating the two groups, with a separation of 3.4.
Significant=cbind(Significant,
glmnet_sign.ext.5.T2C=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T2HW=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
set.seed(5693)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T2C_1,repls$CYM_T2C_2,repls$CYM_T2C_3,repls$CYM_T2HW_1,repls$CYM_T2HW_1,repls$CYM_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T2C_1",1:M,sep="_"),
paste("CYM_T2C_2",1:M,sep="_"),
paste("CYM_T2C_3",1:M,sep="_"),
paste("CYM_T2HW_1",1:M,sep="_"),
paste("CYM_T2HW_2",1:M,sep="_"),
paste("CYM_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T2C", 3*M),rep("CYM_T2HW", 3*M),rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_2C2HW_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_2C2HW_10copies.RData")
LogRat_mostres[[1]]=LogRat_mostres[[1]][c(1:60,67:72),]
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)
impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))
With 10 replicates, 82 significant taxa are found, perfectly separating the two groups, with a separation of 3.09.
Significant=cbind(Significant,
glmnet_sign.ext.10.T2C=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T2HW=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711815195
initial_seed %% 10000
# [1] 5195
set.seed(5195)
repl_kw=aldexCesc.clr(t(DF), Type=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
saveRDS(repl_kw, file="repl_kw_2C2HW.RData")
saveRDS(aldex_kw, file="aldex_kw_2C2HW.RData")
p.valores_kw=readRDS("aldex_kw_2C2HW.RData")
length(p.valores_kw[p.valores_kw[,1]<0.01,1])
## [1] 0
length(p.valores_kw[p.valores_kw[,1]<0.05,1])
## [1] 29
length(p.valores_kw[p.valores_kw[,2]<0.01,2])
## [1] 0
length(p.valores_kw[p.valores_kw[,2]<0.01,2])
## [1] 0
sep.kw.noadjust.0.05=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1,BP=TRUE)
There are no taxa with a Kruskal–Wallis adjusted p-value < 0.05. There are 29 with an unadjusted p-value < 0.05 (none < 0.01); in the absence of a better option, we select them. They perfectly separate the two groups, with a separation of 3.11.
Significant=cbind(Significant,
Aldex.kw.noadjust.0.05=Indicador(sep.kw.noadjust.0.05))
DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP=SMP.prop$CYM_T2C_CYM_T2HW$ord[which(SMP.prop$CYM_T2C_CYM_T2HW$cusum<=0.7)]
OTUs_SMP=sort(attr(SMP.prop$CYM_T2C_CYM_T2HW$cusum[which(SMP.prop$CYM_T2C_CYM_T2HW$cusum<=0.7)],"names"))
OTUs_SMP.good=setdiff(OTUs_SMP,Unics$OTUs)
HC=ClusterHC(DF[,OTUs_SMP.good],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
simper.props=c(length(indexos_SMP),Sep(HC))
We obtain 120 significant taxa, which perfectly separate the two sample types.They include OTU2121, which we had previously removed because it appeared in only one sample. The separation between groups is 1.42.
Indicador.Gl=function(x,k=n.OTUs.original){
xx=rep(0,k)
xx[x]=1
return(xx)
}
Significant=cbind(Significant,
simper.rel=Indicador.Gl(indexos_SMP))
Significant=cbind(Significant,
t(DF.prop))
write.csv2(Significant,"OTU_Significativos_CYM_T2.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10,
sep.kw.noadjust.0.05,
simper.props
)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"Aldex",
"simper")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,6,20),ylim=c(0,6))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.01,col="blue",labels=row.names(Resultados),cex=0.75)
Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates
Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
There are 60 taxa. They are:
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU4 | OTU4 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Robiginitalea;s__bacterium_AMSU |
| OTU6 | OTU6 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Maritimimonas;s__uncultured_Bacteroidetes |
| OTU19 | OTU19 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__Anaerolineales;f__Anaerolineaceae;guncultured; |
| OTU25 | OTU25 | d__Archaea;p__Asgardarchaeota;c__Heimdallarchaeia;o__Heimdallarchaeia;f__Heimdallarchaeia;g__Heimdallarchaeia;s__uncultured_archaeon |
| OTU26 | OTU26 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__SBR1031;f__SBR1031;g__SBR1031;s__uncultured_organism |
| OTU27 | OTU27 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Ulvibacter;s__Aureitalea_sp. |
| OTU31 | OTU31 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;oEctothiorhodospirales;;; |
| OTU34 | OTU34 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gMaribacter; |
| OTU48 | OTU48 | d__Bacteria;p__Patescibacteria;c__Parcubacteria;;;; |
| OTU56 | OTU56 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Arenicellales;f__Arenicellaceae;guncultured; |
| OTU78 | OTU78 | d__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Spirochaetales;f__Spirochaetaceae;gSpirochaeta; |
| OTU85 | OTU85 | d__Bacteria;pChloroflexi;;;;; |
| OTU94 | OTU94 | d__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Spirochaetales;f__Spirochaetaceae;g__Spirochaeta;s__uncultured_bacterium |
| OTU102 | OTU102 | d__Bacteria;p__Bacteroidota;c__Ignavibacteria;o__Ignavibacteriales;f__Melioribacteraceae;g__IheB3-7;s__uncultured_Chlorobi |
| OTU107 | OTU107 | d__Bacteria;p__Actinobacteriota;c__WCHB1-81;o__WCHB1-81;f__WCHB1-81;gWCHB1-81; |
| OTU118 | OTU118 | d__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Marine_Benthic_Group_D_and_DHVEG-1;f__Marine_Benthic_Group_D_and_DHVEG-1;g__Marine_Benthic_Group_D_and_DHVEG-1;s__uncultured_archaeon |
| OTU123 | OTU123 | d__Bacteria;p__Planctomycetota;c__Planctomycetes;o__Pirellulales;f__Pirellulaceae;gPir4_lineage; |
| OTU126 | OTU126 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;guncultured; |
| OTU133 | OTU133 | d__Bacteria;p__Desulfobacterota;c__Desulfobacteria;o__Desulfatiglandales;f__Desulfatiglandaceae;gDesulfatiglans; |
| OTU136 | OTU136 | d__Bacteria;p__Patescibacteria;c__Parcubacteria;o__Candidatus_Kaiserbacteria;f__Candidatus_Kaiserbacteria;g__Candidatus_Kaiserbacteria;s__uncultured_bacterium |
| OTU137 | OTU137 | d__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Marine_Benthic_Group_D_and_DHVEG-1;f__Marine_Benthic_Group_D_and_DHVEG-1;gMarine_Benthic_Group_D_and_DHVEG-1; |
| OTU143 | OTU143 | d__Archaea;p__Crenarchaeota;c__Bathyarchaeia;o__Bathyarchaeia;f__Bathyarchaeia;gBathyarchaeia; |
| OTU144 | OTU144 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gAquibacter; |
| OTU163 | OTU163 | d__Bacteria;p__Modulibacteria;c__Moduliflexia;o__Moduliflexales;f__Moduliflexaceae;g__Moduliflexaceae;s__uncultured_bacterium |
| OTU168 | OTU168 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Thiomicrospirales;f__Thiomicrospiraceae;gendosymbionts; |
| OTU174 | OTU174 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__SB-5;g__SB-5;s__bacterium_YC-LK-LKJ31 |
| OTU178 | OTU178 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gWinogradskyella; |
| OTU181 | OTU181 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;;;; |
| OTU182 | OTU182 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__uncultured;f__uncultured;guncultured; |
| OTU186 | OTU186 | d__Bacteria;p__LCP-89;c__LCP-89;o__LCP-89;f__LCP-89;g__LCP-89;s__uncultured_bacterium |
| OTU200 | OTU200 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;; |
| OTU212 | OTU212 | d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bacteriovoracales;f__Bacteriovoracaceae;g__uncultured;s__uncultured_bacterium |
| OTU216 | OTU216 | d__Bacteria;p__Actinobacteriota;c__WCHB1-81;o__WCHB1-81;f__WCHB1-81;g__WCHB1-81;s__uncultured_bacterium |
| OTU224 | OTU224 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Chromatiales;f__Sedimenticolaceae;g__uncultured;s__uncultured_bacterium |
| OTU241 | OTU241 | d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__Anaerolineales;f__Anaerolineaceae;g__uncultured;s__uncultured_Chloroflexi |
| OTU283 | OTU283 | d__Bacteria;p__Calditrichota;c__Calditrichia;o__Calditrichales;f__Calditrichaceae;; |
| OTU296 | OTU296 | d__Bacteria;p__Patescibacteria;c__Parcubacteria;o__Candidatus_Kaiserbacteria;f__Candidatus_Kaiserbacteria;g__Candidatus_Kaiserbacteria;s__uncultured_prokaryote |
| OTU309 | OTU309 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;gLewinella; |
| OTU329 | OTU329 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Aquibacter;s__uncultured_marine |
| OTU347 | OTU347 | d__Bacteria;p__Chloroflexi;c__Dehalococcoidia;;;; |
| OTU372 | OTU372 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Pseudohongiellaceae;gPseudohongiella; |
| OTU412 | OTU412 | d__Bacteria;p__Desulfobacterota;c__Desulfobulbia;oDesulfobulbales;;; |
| OTU426 | OTU426 | d__Bacteria;p__Verrucomicrobiota;c__Omnitrophia;o__Omnitrophales;f__Omnitrophaceae;g__Candidatus_Omnitrophus;s__uncultured_bacterium |
| OTU433 | OTU433 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;g__Lewinella;s__uncultured_marine |
| OTU490 | OTU490 | d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bacteriovoracales;f__Bacteriovoracaceae;gPeredibacter; |
| OTU516 | OTU516 | d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bdellovibrionales;f__Bdellovibrionaceae;gBdellovibrio; |
| OTU565 | OTU565 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;g__Portibacter;s__uncultured_Bacteroidetes |
| OTU574 | OTU574 | d__Bacteria;p__Planctomycetota;c__Planctomycetes;o__Pirellulales;f__Pirellulaceae;g__uncultured;s__uncultured_organism |
| OTU597 | OTU597 | d__Bacteria;p__Acidobacteriota;c__Thermoanaerobaculia;o__Thermoanaerobaculales;f__Thermoanaerobaculaceae;g__Subgroup_23;s__uncultured_bacterium |
| OTU622 | OTU622 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__uncultured;g__uncultured;s__uncultured_Bacteroidetes |
| OTU625 | OTU625 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__pItb-vmat-80;f__pItb-vmat-80;gpItb-vmat-80; |
| OTU659 | OTU659 | d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rickettsiales;f__Mitochondria;g__Mitochondria;s__uncultured_bacterium |
| OTU785 | OTU785 | d__Archaea;p__Asgardarchaeota;c__Odinarchaeia;o__Odinarchaeia;f__Odinarchaeia;g__Odinarchaeia;s__uncultured_crenarchaeote |
| OTU854 | OTU854 | d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Opitutales;f__Puniceicoccaceae;gLentimonas; |
| OTU980 | OTU980 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;g__Phaeodactylibacter;s__uncultured_organism |
| OTU1088 | OTU1088 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Cellvibrionales;f__Halieaceae;gCongregibacter; |
| OTU1502 | OTU1502 | d__Archaea;p__Thermoplasmatota;c__Thermoplasmatota;o__Thermoplasmatota;f__Thermoplasmatota;guncultured; |
| OTU1532 | OTU1532 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Chromatiales;f__Chromatiaceae;g__Candidatus_Thiobios;s__uncultured_organism |
| OTU1539 | OTU1539 | d__Bacteria;p__Acidobacteriota;c__Vicinamibacteria;o__Subgroup_17;f__Subgroup_17;gSubgroup_17; |
| OTU2370 | OTU2370 | d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bacteriovoracales;f__Bacteriovoracaceae;g__uncultured;s__uncultured_deep-sea |
The separation between groups is 4.31.
We update the plot with the separations:
intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"Aldex",
"simper",
"Intersection")
colnames(Resultados)=c("Size","Separation")
plot(F.SS[-1,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,150),xaxp=c(0,150,15),yaxp=c(0,6,12),ylim=c(0,6))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados[-dim(Resultados)[1],],col="blue",type="h")
points(Resultados[-dim(Resultados)[1],],col="blue",pch=20)
points(intersección[1],intersección[2],col="purple",type="h")
points(intersección[1],intersección[2],col="purple",pch=20)
text(Resultados[-dim(Resultados)[1],1],Resultados[-dim(Resultados)[1],2]+0.1,col="blue",labels=row.names(Resultados)[-dim(Resultados)[1]],cex=0.75)
text(intersección[1],intersección[2]+0.1,col="purple",labels="intersección",cex=0.75)
abline(h=base,col="green")
text(20,base+0.1,col="green",labels="Global",cex=0.75)
Resultados %>%
kbl() %>%
kable_styling()
| Size | Separation | |
|---|---|---|
| LR | 563 | 2.150788 |
| glmnet.ext.3 | 81 | 3.668474 |
| glmnet.ext.5 | 84 | 3.399211 |
| glmnet.ext.10 | 82 | 3.085298 |
| Aldex | 29 | 3.114325 |
| simper | 120 | 1.415174 |
| Intersection | 60 | 4.306244 |
tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[1:12,]
Samples=tax_otu$ID
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")
DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs
taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
Miseria=as.vector(which(colSums(DF.0[1:6,])<=3))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF=DF[1:6,]
DF.0=DF.0[1:6,]
DF.0.original=DF.0.original[1:6 ,]
Type=as.factor(c(rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors=c("blue","green")[Type]
After removing OTUs with 2 or fewer reads across the samples and those appearing in only one of these samples, 1194 taxa remain.
The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions |
|---|---|---|---|
| CAU_T0_2 | OTU966 | 55 | 0.0012226 |
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)
HC=ClusterHC(DF,Grups=Type,colores=colors)
base=Sep(HC)
The two sample types are not initially separated.
We now examine the proportion of OTU subsets of reasonable size that already classify them correctly.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717749309
initial_seed %% 10000
# [1] 9309
set.seed(9309)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=500
F.SS=c()
for (i in 10:600)
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
if(length(which(EE[1,]==0))>0){
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))}
else {
F.SS=rbind(F.SS, c(i,
0,
NA,
NA,
NA))
}
}
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CAU.T0vsT1.RData")
F.SS=readRDS("Fraccio.Separadors.CAU.T0vsT1.RData")
The first plot shows, for each \(n\) up to 500, the proportion of samples of size \(n\) that perfectly classify CAU_T0 and CAU_T1 (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",ylim=c(0,0.1),xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,0.1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU_T0 and CAU_T1 (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,1.5,15))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
`
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CAU_T0T1.RData")
LRS=readRDS("LRS_CAU_T0T1.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 24 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.3.
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Log_ratio.max=Indicador(impLR))
coda_glmnet)coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL
coda_glmnet does not find taxa with significant differences. We add random replicates. We test with 3, 5, and 10 replicates.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717764843
initial_seed %% 10000
# [1] 4843
# 3 replicates
set.seed(4843)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,repls$CAU_T1_1,repls$CAU_T1_2,repls$CAU_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T0_1",1:M,sep="_"),
paste("CAU_T0_2",1:M,sep="_"),
paste("CAU_T0_3",1:M,sep="_"),
paste("CAU_T1_1",1:M,sep="_"),
paste("CAU_T1_2",1:M,sep="_"),
paste("CAU_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T0", 3*M),rep("CAU_T1", 3*M),rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_T0T1_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_T0T1_3copies.RData")
coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))
With 3 replicates, 45 significant taxa are found, perfectly separating the two groups, with a separation of 0.95.
Significant=cbind(Significant,
glmnet_sign.ext.3.T0=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`>0]),
glmnet_sign.ext.3.T1=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`<0]))
# 5 replicates
set.seed(4843)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,repls$CAU_T1_1,repls$CAU_T1_2,repls$CAU_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T0_1",1:M,sep="_"),
paste("CAU_T0_2",1:M,sep="_"),
paste("CAU_T0_3",1:M,sep="_"),
paste("CAU_T1_1",1:M,sep="_"),
paste("CAU_T1_2",1:M,sep="_"),
paste("CAU_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T0", 3*M),rep("CAU_T1", 3*M),rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_T0T1_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_T0T1_5copies.RData")
coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))
With 5 replicates, 51 significant taxa are found, perfectly separating the two groups, with a separation of 0.88.
Significant=cbind(Significant,
glmnet_sign.ext.5.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
# 10 replicates
set.seed(4843)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,repls$CAU_T1_1,repls$CAU_T1_2,repls$CAU_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T0_1",1:M,sep="_"),
paste("CAU_T0_2",1:M,sep="_"),
paste("CAU_T0_3",1:M,sep="_"),
paste("CAU_T1_1",1:M,sep="_"),
paste("CAU_T1_2",1:M,sep="_"),
paste("CAU_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T0", 3*M),rep("CAU_T1", 3*M),rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_T0T1_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_T0T1_10copies.RData")
coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))
With 10 replicates, 49 significant taxa are found, but they do not classify well the samples.
Significant=cbind(Significant,
glmnet_sign.ext.10.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717772151
initial_seed %% 10000
# [1] 2151
set.seed(2151)
repl_kw=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
saveRDS(aldex_kw, file="aldex_kw_CAU_T0T1.RData")
p.valores_kw=readRDS("aldex_kw_CAU_T0T1.RData")
length(p.valores_kw[p.valores_kw[,1]<0.05,2])
## [1] 0
There are no taxa with a p-value < 0.05, even without adjustment.
The significant taxa identified by simper do not classify the samples well:
DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP=SMP.prop$CAU_T0_CAU_T1$ord[which(SMP.prop$CAU_T0_CAU_T1$cusum<=0.7)]
OTUs_SMP=sort(attr(SMP.prop$CAU_T0_CAU_T1$cusum[which(SMP.prop$CAU_T0_CAU_T1$cusum<=0.7)],"names"))
OTUs_SMP.good=setdiff(OTUs_SMP,Unics$OTUs)
HC=ClusterHC(DF[,OTUs_SMP.good],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
Significant=cbind(Significant,
t(DF.prop))
write.csv2(Significant,"OTU_Significativos_CAU_T0T1.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10
)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,650),xaxp=c(0,650,13),ylim=c(0,max(Resultados[,2])+0.1))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.01,col="blue",labels=row.names(Resultados),cex=0.75)
We intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates
Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU952 | OTU952 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Beggiatoales;f__Beggiatoaceae;g__Thioflexothrix;s__Thiotrichales_bacterium |
A single OTU. The same happens with 5 replicates:
Significant.capat=Significant[,c(1,2,3,6,7)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU828 | OTU828 | d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Rubritaleaceae;g__Roseibacillus;s__uncultured_organism |
Resultados %>%
kbl() %>%
kable_styling()
| Size | Separation | |
|---|---|---|
| LR | 24 | 2.2969800 |
| glmnet.ext.3 | 45 | 0.9462113 |
| glmnet.ext.5 | 51 | 0.8765157 |
| glmnet.ext.10 | 49 | 0.8640208 |
tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[13:24,]
Samples=tax_otu$ID
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")
DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs
taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
Miseria=as.vector(which(colSums(DF.0[1:6,])<=3))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))
Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL
DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF=DF[1:6,]
DF.0=DF.0[1:6,]
DF.0.original=DF.0.original[1:6 ,]
Type=as.factor(c(rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors=c("blue","green")[Type]
After removing OTUs with 2 or fewer reads across the samples and those appearing in only one of these samples, 1658 taxa remain.
The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:
Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
kbl() %>%
kable_styling()
| Sample | OTUs | Reads | Proportions |
|---|---|---|---|
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)
HC=ClusterHC(DF,Grups=Type,colores=colors)
base=Sep(HC)
The two sample types are not initially separated.
We now examine the proportion of OTU subsets of reasonable size that classify them correctly.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717668665
initial_seed %% 10000
# [1] 8665
set.seed(8665)
Experimento=function(i)
{
ii=sample(dim(DF)[2],i,rep=FALSE)
DFtemp=DF[,ii]
CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:650) #10
{
print(i)
EE=replicate(n,Experimento(i))
EE.2=EE[2,which(EE[1,]==0)]
if(length(which(EE[1,]==0))>0){
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
F.SS=rbind(F.SS, c(i,
length(EE.2)/250,
mean(EE.2),
quantile(XX,0.025),
quantile(XX,0.975)
))}
else {
F.SS=rbind(F.SS, c(i,
0,
NA,
NA,
NA))
}
}
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CYM.T0vsT1.RData")
F.SS=readRDS("Fraccio.Separadors.CYM.T0vsT1.RData")
The first plot shows, for each \(n\) up to 500, the proportion of samples of size \(n\) that perfectly classify CYM_T0 and CYM_T1 (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.
F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)
plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",ylim=c(0,0.1),xlim=c(0,650),xaxp=c(0,650,13),yaxp=c(0,0.1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CYM_T0 and CYM_T1 (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,650),xaxp=c(0,650,13),yaxp=c(0,1.5,15))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
`
LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CYM_T0T1.RData")
LRS=readRDS("LRS_CYM_T0T1.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)
m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)
LR.max=c(length(impLR),Sep(HC))
The maximum is attained at the set of 169 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.46.
Significant=data.frame(
OTUs=paste("OTU",1:n.OTUs.original,sep=""),
taxa=taxa.original,
Log_ratio.max=Indicador(impLR))
coda_glmnet)coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL
coda_glmnet does not find taxa with significant differences. We add random replicates. We test with 3, 5, and 10 replicates.
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717709647
initial_seed %% 10000
# [1] 9647
# 3 replicates
set.seed(9647)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3,repls$CYM_T1_1,repls$CYM_T1_2,repls$CYM_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T0_1",1:M,sep="_"),
paste("CYM_T0_2",1:M,sep="_"),
paste("CYM_T0_3",1:M,sep="_"),
paste("CYM_T1_1",1:M,sep="_"),
paste("CYM_T1_2",1:M,sep="_"),
paste("CYM_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T0", 3*M),rep("CYM_T1", 3*M),rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_T0T1_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_T0T1_3copies.RData")
coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))
With 3 replicates, 62 significant taxa are found, perfectly separating the two groups, with a separation of 2.05.
Significant=cbind(Significant,
glmnet_sign.ext.3.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.3.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
# 5 replicates
set.seed(9647)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3,repls$CYM_T1_1,repls$CYM_T1_2,repls$CYM_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T0_1",1:M,sep="_"),
paste("CYM_T0_2",1:M,sep="_"),
paste("CYM_T0_3",1:M,sep="_"),
paste("CYM_T1_1",1:M,sep="_"),
paste("CYM_T1_2",1:M,sep="_"),
paste("CYM_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T0", 3*M),rep("CYM_T1", 3*M),rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_T0T1_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_T0T1_5copies.RData")
coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))
With 5 replicates, 74 significant taxa are found, perfectly separating the two groups, with a separation of 1.36.
Significant=cbind(Significant,
glmnet_sign.ext.5.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
# 10 replicates
set.seed(9647)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3,repls$CYM_T1_1,repls$CYM_T1_2,repls$CYM_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T0_1",1:M,sep="_"),
paste("CYM_T0_2",1:M,sep="_"),
paste("CYM_T0_3",1:M,sep="_"),
paste("CYM_T1_1",1:M,sep="_"),
paste("CYM_T1_2",1:M,sep="_"),
paste("CYM_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T0", 3*M),rep("CYM_T1", 3*M),rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_T0T1_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_T0T1_10copies.RData")
coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)
impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))
With 10 replicates, 67 significant taxa are found, perfectly separating the two groups, with a separation of 1.26.
Significant=cbind(Significant,
glmnet_sign.ext.10.T0=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T1=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717738742
initial_seed %% 10000
# [1] 8742
set.seed(8742)
repl_kw=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
repl_glm=aldexCesc.clr(t(DF), conds=model.matrix(~Type,data.frame(Type)), mc.samples=1000, verbose=FALSE)
aldex_glm=aldex.glm(repl_glm, model.matrix(~Type,data.frame(Type)))
#'
saveRDS(aldex_kw, file="aldex_kw_CYM_T0T1.RData")
saveRDS(aldex_glm, file="aldex_glm_CYM_T0T1.RData")
p.valores_kw=readRDS("aldex_kw_CYM_T0T1.RData")
length(p.valores_kw[p.valores_kw[,2]<0.05,2])
## [1] 0
length(p.valores_kw[p.valores_kw[,4]<0.05,2])
## [1] 88
length(p.valores_kw[p.valores_kw[,4]<0.01,4])
## [1] 6
There are no OTUs with an adjusted Kruskal–Wallis p-value < 0.05. There are 66 OTUs with an unadjusted p-value < 0.05, but they do not separate the samples well:
sep.kw.noadjust.0.05=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1,BP=FALSE)
Significant=cbind(Significant,
Aldex.kw.noadjust.0.05=Indicador(sep.kw.noadjust.0.05))
The significant taxa identified by simper do not classify the samples well:
DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP=SMP.prop$CYM_T0_CYM_T1$ord[which(SMP.prop$CYM_T0_CYM_T1$cusum<=0.7)]
OTUs_SMP=sort(attr(SMP.prop$CYM_T0_CYM_T1$cusum[which(SMP.prop$CYM_T0_CYM_T1$cusum<=0.7)],"names"))
OTUs_SMP.good=setdiff(OTUs_SMP,Unics$OTUs)
HC=ClusterHC(DF[,OTUs_SMP.good],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
Significant=cbind(Significant,
t(DF.prop))
write.csv2(Significant,"OTU_Significativos_CYM_T0T1.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10"
)
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,650),xaxp=c(0,650,13),ylim=c(0,max(Resultados[,2])+0.1))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.01,col="blue",labels=row.names(Resultados),cex=0.75)
We intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates.
Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
There are 1 taxa. They are:
Significant.capat[,1:2]%>%
kbl() %>%
kable_styling()
| OTUs | taxa | |
|---|---|---|
| OTU235 | OTU235 | d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Thiomicrospirales;f__Thiomicrospiraceae;g__endosymbionts;s__uncultured_bacterium |
The separation between groups is 0.98.
We update the plot with the separations:
intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)
plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Resultados %>%
kbl() %>%
kable_styling()